From ad21c9e1f14d82b8c15ae369a8839019e3d498b4 Mon Sep 17 00:00:00 2001 From: Neo Zhang Jianyu Date: Wed, 20 Nov 2024 13:54:25 +0800 Subject: [PATCH 01/14] update rel to 4040 (#10395) Co-authored-by: arthw <14088817+arthw@users.noreply.github.com> --- docs/backend/SYCL.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index e431f51f1..8d8312e91 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -34,9 +34,10 @@ The SYCL backend would be broken by some PRs due to no online CI. The following release is verified with good quality: -|Commit ID|Tag|Release|Verified Platform| -|-|-|-|-| -|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1| +|Commit ID|Tag|Release|Verified Platform| Update date| +|-|-|-|-|-| +|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19| +|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1|| ## News From 1bacb9f62514b520bdf74ed6feb46c80508dad38 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 20 Nov 2024 01:11:00 -0600 Subject: [PATCH 02/14] vulkan: further optimize mul_mat_vec using larger loads (#10387) * vulkan: Use pipeline_robustness to disable robustness in mul_mat_vec. Add some early returns for nonexistent rows in mul_mat_vec shaders. These can only be hit when dispatching a 2D grid of workgroups. Fix the logic for the 2D grid of workgroups to round up. Enable the pipeline robustness extension if it's available, and use it to disable robustness for these pipelines. The instructions to do the bounds checking contend for the same ALU resources as the bit twiddling dequant instructions. * vulkan: Add GLSL structure aliases for quant types to allow larger loads In Vulkan it's not possible to cast pointer types, so instead you have to declare an aliased binding for the memory with a different type. This commit adds aliases for the quant formats using 16b ints, and in a few places where the struct size is a multiple of 4 also using 32b ints. Currently only q4_k's aliases are used, but others will be used in subsequent commits. * vulkan: use larger loads in q5_k and q6_k shaders. Similar to the optimization I did in q4_k recently, this vectorizes some loads and reduces the number of bit twiddling instructions. * vulkan: use larger K step per iteration in mul_mat_vec. Add vec4 dequantization functions, and use them to do K=8 per iteration in mul_mat_vec. This uses 16b loads for the quant values and 128b loads for B which helps reduce the load on the memory system. The K_PER_ITER==2 logic is still there, just for F16/F32, and really only because they support unaligned sizes. Tweak the num_iters/unrolling logic to be simpler and catch a couple missed unrolling opportunities. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 101 ++++++++----- .../vulkan-shaders/dequant_funcs.comp | 48 ++++++ .../vulkan-shaders/mul_mat_vec.comp | 82 ++++++++-- .../vulkan-shaders/mul_mat_vec_base.comp | 3 + .../vulkan-shaders/mul_mat_vec_q2_k.comp | 4 + .../vulkan-shaders/mul_mat_vec_q3_k.comp | 4 + .../vulkan-shaders/mul_mat_vec_q4_k.comp | 34 ++--- .../vulkan-shaders/mul_mat_vec_q5_k.comp | 140 ++++++++++++------ .../vulkan-shaders/mul_mat_vec_q6_k.comp | 69 ++++++--- .../src/ggml-vulkan/vulkan-shaders/types.comp | 113 +++++++++++++- .../vulkan-shaders/vulkan-shaders-gen.cpp | 6 +- 11 files changed, 457 insertions(+), 147 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 21fee2f3d..ca71da2f7 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -158,6 +158,7 @@ struct vk_device_struct { std::string name; uint64_t max_memory_allocation_size; bool fp16; + bool pipeline_robustness; vk::Device device; uint32_t vendor_id; vk_queue compute_queue; @@ -654,7 +655,7 @@ static uint32_t compile_count = 0; static std::mutex compile_count_mutex; static std::condition_variable compile_count_cond; -static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector specialization_constants, uint32_t align) { +static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector specialization_constants, uint32_t align, bool disable_robustness) { VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")"); GGML_ASSERT(parameter_count > 0); GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT @@ -724,6 +725,15 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin vk::PipelineCreateFlags(), pipeline_shader_create_info, pipeline->layout); + + vk::PipelineRobustnessCreateInfoEXT rci; + + if (device->pipeline_robustness && disable_robustness) { + rci.storageBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled; + rci.uniformBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled; + compute_pipeline_create_info.setPNext(&rci); + } + pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value; { @@ -1261,7 +1271,7 @@ static void ggml_vk_load_shaders(vk_device& device) { device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared(); std::vector> compiles; - auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, uint32_t align) { + auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, uint32_t align, bool disable_robustness = false) { { // wait until fewer than N compiles are in progress uint32_t N = std::max(1u, std::thread::hardware_concurrency()); @@ -1271,7 +1281,7 @@ static void ggml_vk_load_shaders(vk_device& device) { } compile_count++; } - compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align)); + compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness)); }; if (device->fp16) { @@ -1370,45 +1380,45 @@ static void ggml_vk_load_shaders(vk_device& device) { // computing two rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0. ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32", mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); // dequant shaders ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); @@ -1591,12 +1601,15 @@ static vk_device ggml_vk_get_device(size_t idx) { bool fp16_storage = false; bool fp16_compute = false; + bool pipeline_robustness = false; for (const auto& properties : ext_props) { if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { fp16_storage = true; } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { fp16_compute = true; + } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) { + pipeline_robustness = true; } } @@ -1642,10 +1655,22 @@ static vk_device ggml_vk_get_device(size_t idx) { vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; vk11_features.pNext = &vk12_features; + VkPhysicalDevicePipelineRobustnessFeaturesEXT pl_robustness_features; + pl_robustness_features.pNext = nullptr; + pl_robustness_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_ROBUSTNESS_FEATURES_EXT; + pl_robustness_features.pipelineRobustness = VK_FALSE; + + if (pipeline_robustness) { + vk12_features.pNext = &pl_robustness_features; + device_extensions.push_back("VK_EXT_pipeline_robustness"); + } + vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2); device->fp16 = device->fp16 && vk12_features.shaderFloat16; + device->pipeline_robustness = pl_robustness_features.pipelineRobustness; + if (!vk11_features.storageBuffer16BitAccess) { std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl; throw std::runtime_error("Unsupported device"); @@ -3190,7 +3215,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& if (ne01 > max_groups_x) { groups_z = 64; - groups_x /= groups_z; + groups_x = CEIL_DIV(groups_x, groups_z); } // compute @@ -3767,7 +3792,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte if (ne01 > max_groups_x) { groups_z = 64; - groups_x /= groups_z; + groups_x = CEIL_DIV(groups_x, groups_z); } // compute diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp index d5b989735..5fc1ba4ad 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp @@ -2,6 +2,15 @@ #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require #endif +#include "types.comp" + +#if defined(A_TYPE_PACKED16) +layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];}; +#endif +#if defined(A_TYPE_PACKED32) +layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];}; +#endif + #if defined(DATA_A_F32) vec2 dequantize(uint ib, uint iqs, uint a_offset) { return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]); @@ -20,6 +29,11 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); return (vec2(vui & 0xF, vui >> 4) - 8.0f) * d; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); + return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, (vui >> 12) & 0xF) - 8.0f) * d; +} #endif #if defined(DATA_A_Q4_1) @@ -29,6 +43,12 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); return vec2(vui & 0xF, vui >> 4) * d + m; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + const float m = float(data_a_packed16[a_offset + ib].m); + const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); + return vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, (vui >> 12) & 0xF) * d + m; +} #endif #if defined(DATA_A_Q5_0) @@ -39,6 +59,14 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f) * d; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + const uint uint_qh = uint(data_a_packed16[a_offset + ib].qh[1]) << 16 | data_a_packed16[a_offset + ib].qh[0]; + const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); + const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10); + const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); + return (vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) - 16.0f) * d; +} #endif #if defined(DATA_A_Q5_1) @@ -50,6 +78,15 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) * d + m; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + const float m = float(data_a_packed16[a_offset + ib].m); + const uint uint_qh = data_a_packed16[a_offset + ib].qh; + const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); + const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10); + const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); + return vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) * d + m; +} #endif #if defined(DATA_A_Q8_0) @@ -57,6 +94,12 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const float d = float(data_a[a_offset + ib].d); return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + uint32_t v0 = data_a_packed16[a_offset + ib].qs[iqs/2]; + uint32_t v1 = data_a_packed16[a_offset + ib].qs[iqs/2 + 1]; + return vec4(int8_t(v0 & 0xFF), int8_t((v0 >> 8) & 0xFF), int8_t(v1 & 0xFF), int8_t((v1 >> 8) & 0xFF)) * d; +} #endif #if defined(DATA_A_IQ4_NL) @@ -65,4 +108,9 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); + return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[(vui >> 12) & 0xF]) * d; +} #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp index 970aac6ef..00807a060 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp @@ -3,7 +3,7 @@ #ifdef FLOAT16 #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require #endif -#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types : require #include "mul_mat_vec_base.comp" @@ -12,16 +12,48 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (constant_id = 0) const uint BLOCK_SIZE = 32; layout (constant_id = 1) const uint NUM_ROWS = 1; +#if !defined(DATA_A_F32) && !defined(DATA_A_F16) +#define K_PER_ITER 8 +#else +#define K_PER_ITER 2 +#endif + + uint a_offset, b_offset, d_offset, y_offset; shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter) { - const uint col = i*BLOCK_SIZE + 2*tid; + const uint col = i*BLOCK_SIZE + K_PER_ITER*tid; const uint iqs = (col%QUANT_K)/QUANT_R; // quant index const uint iybs = col - col%QUANT_K; // y block start index +#if K_PER_ITER == 8 +#if QUANT_R == 2 + B_TYPE_VEC4 bv02 = data_b_v4[(b_offset + iybs + iqs) / 4]; + B_TYPE_VEC4 bv13 = data_b_v4[(b_offset + iybs + iqs + y_offset) / 4]; + FLOAT_TYPE b0 = FLOAT_TYPE(bv02.x); + FLOAT_TYPE b1 = FLOAT_TYPE(bv13.x); + FLOAT_TYPE b2 = FLOAT_TYPE(bv02.y); + FLOAT_TYPE b3 = FLOAT_TYPE(bv13.y); + FLOAT_TYPE b4 = FLOAT_TYPE(bv02.z); + FLOAT_TYPE b5 = FLOAT_TYPE(bv13.z); + FLOAT_TYPE b6 = FLOAT_TYPE(bv02.w); + FLOAT_TYPE b7 = FLOAT_TYPE(bv13.w); +#else + B_TYPE_VEC4 bv0 = data_b_v4[(b_offset + iybs + iqs) / 4]; + B_TYPE_VEC4 bv1 = data_b_v4[(b_offset + iybs + iqs) / 4 + 1]; + FLOAT_TYPE b0 = FLOAT_TYPE(bv0.x); + FLOAT_TYPE b1 = FLOAT_TYPE(bv0.y); + FLOAT_TYPE b2 = FLOAT_TYPE(bv0.z); + FLOAT_TYPE b3 = FLOAT_TYPE(bv0.w); + FLOAT_TYPE b4 = FLOAT_TYPE(bv1.x); + FLOAT_TYPE b5 = FLOAT_TYPE(bv1.y); + FLOAT_TYPE b6 = FLOAT_TYPE(bv1.z); + FLOAT_TYPE b7 = FLOAT_TYPE(bv1.w); +#endif +#else // Check if the second of the pair of elements is OOB, and don't fetch B or // accumulate it. We still fetch a pair of elements for A, which is fine for // quantized formats since they'll be within the same block. We should @@ -34,9 +66,24 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_ if (!OOB) { b1 = FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]); } +#endif [[unroll]] for (uint n = 0; n < num_rows; ++n) { const uint ib = ((first_row + n)*p.ncols + col)/QUANT_K; // block index +#if K_PER_ITER == 8 + const vec4 v = dequantize4(ib, iqs, a_offset); + const vec4 v2 = dequantize4(ib, iqs+(4/QUANT_R), a_offset); + + // matrix multiplication + temp[n] = fma(FLOAT_TYPE(v.x), b0, temp[n]); + temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]); + temp[n] = fma(FLOAT_TYPE(v.z), b2, temp[n]); + temp[n] = fma(FLOAT_TYPE(v.w), b3, temp[n]); + temp[n] = fma(FLOAT_TYPE(v2.x), b4, temp[n]); + temp[n] = fma(FLOAT_TYPE(v2.y), b5, temp[n]); + temp[n] = fma(FLOAT_TYPE(v2.z), b6, temp[n]); + temp[n] = fma(FLOAT_TYPE(v2.w), b7, temp[n]); +#else const vec2 v = dequantize(ib, iqs, a_offset); // matrix multiplication @@ -44,6 +91,7 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_ if (!OOB) { temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]); } +#endif } } @@ -61,22 +109,33 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { temp[i] = FLOAT_TYPE(0); } - const int unroll_count = 8; - - const uint num_iters = (p.ncols >= 2*tid) ? ((p.ncols - 2*tid + BLOCK_SIZE - 1) / BLOCK_SIZE) : 0; - const uint unrolled_iters = num_iters & ~(2*unroll_count - 1); + uint num_iters = p.ncols / (K_PER_ITER * BLOCK_SIZE); + if (num_iters * K_PER_ITER * BLOCK_SIZE + K_PER_ITER*tid < p.ncols) { + num_iters++; + } + int unroll_count = 4; + uint unrolled_iters = num_iters & ~(unroll_count - 1); uint i = 0; while (i < unrolled_iters) { // Manually partially unroll the loop [[unroll]] for (uint k = 0; k < unroll_count; ++k) { - iter(temp, first_row, num_rows, tid, i, false); - i += 2; + iter(temp, first_row, num_rows, tid, i*K_PER_ITER, false); + i++; + } + } + unroll_count = 2; + unrolled_iters = num_iters & ~(unroll_count - 1); + while (i < unrolled_iters) { + // Manually partially unroll the loop + [[unroll]] for (uint k = 0; k < unroll_count; ++k) { + iter(temp, first_row, num_rows, tid, i*K_PER_ITER, false); + i++; } } while (i < num_iters) { - iter(temp, first_row, num_rows, tid, i, true); - i += 2; + iter(temp, first_row, num_rows, tid, i*K_PER_ITER, true); + i++; } // sum up partial sums and write back result @@ -106,6 +165,9 @@ void main() { if (first_row + NUM_ROWS <= p.stride_d) { compute_outputs(first_row, NUM_ROWS); } else { + if (first_row >= p.stride_d) { + return; + } compute_outputs(first_row, p.stride_d - first_row); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp index 5920bc936..8d0a57913 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp @@ -12,6 +12,9 @@ layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; +layout (binding = 1) readonly buffer BV2 {B_TYPE_VEC2 data_b_v2[];}; +layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];}; + layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; #ifdef MUL_MAT_ID layout (binding = 3) readonly buffer IDS {int data_ids[];}; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp index ec8eadcd5..e2625d32b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp @@ -9,6 +9,10 @@ shared FLOAT_TYPE tmp[32]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; + if (row >= p.stride_d) { + return; + } + uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp index 3ca4ad85a..a28804533 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp @@ -9,6 +9,10 @@ shared FLOAT_TYPE tmp[32]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; + if (row >= p.stride_d) { + return; + } + uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp index b7c9b722d..5846f2e86 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp @@ -8,30 +8,14 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; shared FLOAT_TYPE tmp[32]; -// Declare aliased versions of A and B bindings that can use 16b/32b loads for -// the quantized values, and vec4 loads for B. -struct block_q4_K_u32 -{ - f16vec2 d; - uint32_t scales[3*QUANT_K/64/4]; - uint32_t qs[QUANT_K/2/4]; -}; - -struct block_q4_K_u16 -{ - f16vec2 d; - uint16_t scales[3*QUANT_K/64/2]; - uint16_t qs[QUANT_K/2/2]; -}; - -layout (binding = 0) readonly buffer A_u32 {block_q4_K_u32 data_a_u32[];}; -layout (binding = 0) readonly buffer A_u16 {block_q4_K_u16 data_a_u16[];}; -layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];}; - // This shader assumes K_QUANTS_PER_ITERATION == 2 for alignment of loads void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; + if (row >= p.stride_d) { + return; + } + uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); @@ -64,9 +48,9 @@ void main() { const FLOAT_TYPE dall = FLOAT_TYPE(d.x); const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); - uint32_t scale0_u32 = data_a_u16[ib0 + i].scales[v_im ]; - uint32_t scale4_u32 = data_a_u16[ib0 + i].scales[v_im + 2]; - uint32_t scale8_u32 = data_a_u16[ib0 + i].scales[v_im + 4]; + uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ]; + uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2]; + uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4]; uvec4 scale0 = uvec4(unpack8(scale0_u32)); uvec4 scale4 = uvec4(unpack8(scale4_u32)); uvec4 scale8 = uvec4(unpack8(scale8_u32)); @@ -80,8 +64,8 @@ void main() { const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2)); const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2)); - uint32_t qs0_u32 = data_a_u32[ib0 + i].qs[q_offset / 4]; - uint32_t qs64_u32 = data_a_u32[ib0 + i].qs[q_offset / 4 + 16]; + uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4]; + uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16]; uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F; uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp index 2306785af..22a6bfae4 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp @@ -1,5 +1,7 @@ #version 450 +#extension GL_EXT_shader_explicit_arithmetic_types : require + #include "mul_mat_vec_base.comp" layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; @@ -9,6 +11,10 @@ shared FLOAT_TYPE tmp[32]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; + if (row >= p.stride_d) { + return; + } + uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); @@ -31,70 +37,106 @@ void main() { const uint8_t hm1 = uint8_t(1 << (2*v_im)); const uint8_t hm2 = uint8_t(hm1 << 4); - tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp + FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) { const uint y1_idx = i * QUANT_K + y_offset; const uint y2_idx = y1_idx + 128; - const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib0 + i].d.x); - const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib0 + i].d.y); + f16vec2 d = data_a[ib0 + i].d; + const FLOAT_TYPE dall = FLOAT_TYPE(d.x); + const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); - const uint8_t sc0 = uint8_t( data_a[ib0 + i].scales[v_im * 2 ] & 0x3f); - const uint8_t sc1 = uint8_t( data_a[ib0 + i].scales[v_im * 2 + 1] & 0x3f); - const uint8_t sc2 = uint8_t( data_a[ib0 + i].scales[v_im * 2 + 4] & 0x3f); - const uint8_t sc3 = uint8_t( data_a[ib0 + i].scales[v_im * 2 + 5] & 0x3f); - const uint8_t sc4 = uint8_t(( data_a[ib0 + i].scales[v_im * 2 + 8] & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 ] & 0xc0) >> 2)); - const uint8_t sc5 = uint8_t(( data_a[ib0 + i].scales[v_im * 2 + 9] & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 1] & 0xc0) >> 2)); - const uint8_t sc6 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 8] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 4] & 0xc0) >> 2)); - const uint8_t sc7 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 9] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 5] & 0xc0) >> 2)); + uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ]; + uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2]; + uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4]; + uvec4 scale0 = uvec4(unpack8(scale0_u32)); + uvec4 scale4 = uvec4(unpack8(scale4_u32)); + uvec4 scale8 = uvec4(unpack8(scale8_u32)); - const uint8_t q4_0 = uint8_t(data_a[ib0 + i].qs[q_offset ] & 0xf); - const uint8_t q4_1 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] & 0xf); - const uint8_t q4_2 = uint8_t(data_a[ib0 + i].qs[q_offset + 16] & 0xf); - const uint8_t q4_3 = uint8_t(data_a[ib0 + i].qs[q_offset + 17] & 0xf); - const uint8_t q4_4 = uint8_t(data_a[ib0 + i].qs[q_offset ] >> 4); - const uint8_t q4_5 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] >> 4); - const uint8_t q4_6 = uint8_t(data_a[ib0 + i].qs[q_offset + 16] >> 4); - const uint8_t q4_7 = uint8_t(data_a[ib0 + i].qs[q_offset + 17] >> 4); - const uint8_t q4_8 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] & 0xf); - const uint8_t q4_9 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] & 0xf); - const uint8_t q4_10 = uint8_t(data_a[ib0 + i].qs[q_offset + 80] & 0xf); - const uint8_t q4_11 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] & 0xf); - const uint8_t q4_12 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] >> 4); - const uint8_t q4_13 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] >> 4); - const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 80] >> 4); - const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] >> 4); + const uint32_t sc0 = ( scale0.x & 0x3f); + const uint32_t sc1 = ( scale0.y & 0x3f); + const uint32_t sc2 = ( scale4.x & 0x3f); + const uint32_t sc3 = ( scale4.y & 0x3f); + const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2)); + const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2)); + const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2)); + const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2)); + + uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16); + uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16); + + uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F; + uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F; + uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F; + uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F; + + uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4)); + uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4)); + uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4)); + uvec4 qs64_80_hi4 = uvec4(unpack8(qs64_80_u32_hi4)); + + const uint32_t q4_0 = qs0_16_lo4.x; + const uint32_t q4_1 = qs0_16_lo4.y; + const uint32_t q4_2 = qs0_16_lo4.z; + const uint32_t q4_3 = qs0_16_lo4.w; + const uint32_t q4_4 = qs0_16_hi4.x; + const uint32_t q4_5 = qs0_16_hi4.y; + const uint32_t q4_6 = qs0_16_hi4.z; + const uint32_t q4_7 = qs0_16_hi4.w; + const uint32_t q4_8 = qs64_80_lo4.x; + const uint32_t q4_9 = qs64_80_lo4.y; + const uint32_t q4_10 = qs64_80_lo4.z; + const uint32_t q4_11 = qs64_80_lo4.w; + const uint32_t q4_12 = qs64_80_hi4.x; + const uint32_t q4_13 = qs64_80_hi4.y; + const uint32_t q4_14 = qs64_80_hi4.z; + const uint32_t q4_15 = qs64_80_hi4.w; + + B_TYPE_VEC2 by10 = data_b_v2[(b_offset + y1_idx) / 2]; + B_TYPE_VEC2 by116 = data_b_v2[(b_offset + y1_idx) / 2 + 8]; + B_TYPE_VEC2 by132 = data_b_v2[(b_offset + y1_idx) / 2 + 16]; + B_TYPE_VEC2 by148 = data_b_v2[(b_offset + y1_idx) / 2 + 24]; + B_TYPE_VEC2 by20 = data_b_v2[(b_offset + y2_idx) / 2]; + B_TYPE_VEC2 by216 = data_b_v2[(b_offset + y2_idx) / 2 + 8]; + B_TYPE_VEC2 by232 = data_b_v2[(b_offset + y2_idx) / 2 + 16]; + B_TYPE_VEC2 by248 = data_b_v2[(b_offset + y2_idx) / 2 + 24]; + + uint32_t qh0 = data_a_packed16[ib0 + i].qh[l0 / 2]; + uint32_t qh1 = qh0 >> 8; + uint32_t qh16 = data_a_packed16[ib0 + i].qh[l0 / 2 + 8]; + uint32_t qh17 = qh16 >> 8; const FLOAT_TYPE sx = - fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), (q4_0 + (((data_a[ib0 + i].qh[l0 ] & hm1) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), (q4_1 + (((data_a[ib0 + i].qh[l0 + 1] & hm1) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 16]), (q4_2 + (((data_a[ib0 + i].qh[l0 + 16] & hm1) != 0) ? 16 : 0)), - FLOAT_TYPE(data_b[b_offset + y1_idx + 17]) * (q4_3 + (((data_a[ib0 + i].qh[l0 + 17] & hm1) != 0) ? 16 : 0))))); + fma(FLOAT_TYPE(by10.x), (q4_0 + (((qh0 & hm1) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by10.y), (q4_1 + (((qh1 & hm1) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by116.x), (q4_2 + (((qh16 & hm1) != 0) ? 16 : 0)), + FLOAT_TYPE(by116.y) * (q4_3 + (((qh17 & hm1) != 0) ? 16 : 0))))); const FLOAT_TYPE sy = - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), (q4_4 + (((data_a[ib0 + i].qh[l0 ] & (hm1 << 1)) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), (q4_5 + (((data_a[ib0 + i].qh[l0 + 1] & (hm1 << 1)) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 48]), (q4_6 + (((data_a[ib0 + i].qh[l0 + 16] & (hm1 << 1)) != 0) ? 16 : 0)), - FLOAT_TYPE(data_b[b_offset + y1_idx + 49]) * (q4_7 + (((data_a[ib0 + i].qh[l0 + 17] & (hm1 << 1)) != 0) ? 16 : 0))))); + fma(FLOAT_TYPE(by132.x), (q4_4 + (((qh0 & (hm1 << 1)) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by132.y), (q4_5 + (((qh1 & (hm1 << 1)) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by148.x), (q4_6 + (((qh16 & (hm1 << 1)) != 0) ? 16 : 0)), + FLOAT_TYPE(by148.y) * (q4_7 + (((qh17 & (hm1 << 1)) != 0) ? 16 : 0))))); const FLOAT_TYPE sz = - fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), (q4_8 + (((data_a[ib0 + i].qh[l0 ] & hm2) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), (q4_9 + (((data_a[ib0 + i].qh[l0 + 1] & hm2) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 16]), (q4_10 + (((data_a[ib0 + i].qh[l0 + 16] & hm2) != 0) ? 16 : 0)), - FLOAT_TYPE(data_b[b_offset + y2_idx + 17]) * (q4_11 + (((data_a[ib0 + i].qh[l0 + 17] & hm2) != 0) ? 16 : 0))))); + fma(FLOAT_TYPE(by20.x), (q4_8 + (((qh0 & hm2) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by20.y), (q4_9 + (((qh1 & hm2) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by216.x), (q4_10 + (((qh16 & hm2) != 0) ? 16 : 0)), + FLOAT_TYPE(by216.y) * (q4_11 + (((qh17 & hm2) != 0) ? 16 : 0))))); const FLOAT_TYPE sw = - fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), (q4_12 + (((data_a[ib0 + i].qh[l0 ] & (hm2 << 1)) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), (q4_13 + (((data_a[ib0 + i].qh[l0 + 1] & (hm2 << 1)) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 48]), (q4_14 + (((data_a[ib0 + i].qh[l0 + 16] & (hm2 << 1)) != 0) ? 16 : 0)), - FLOAT_TYPE(data_b[b_offset + y2_idx + 49]) * (q4_15 + (((data_a[ib0 + i].qh[l0 + 17] & (hm2 << 1)) != 0) ? 16 : 0))))); + fma(FLOAT_TYPE(by232.x), (q4_12 + (((qh0 & (hm2 << 1)) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by232.y), (q4_13 + (((qh1 & (hm2 << 1)) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by248.x), (q4_14 + (((qh16 & (hm2 << 1)) != 0) ? 16 : 0)), + FLOAT_TYPE(by248.y) * (q4_15 + (((qh17 & (hm2 << 1)) != 0) ? 16 : 0))))); const FLOAT_TYPE smin = - fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 17]), sc2, - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 49]), sc3, - fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 17]), sc6, - (FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 49])) * sc7))); - const uint tmp_idx = 16 * ix + tid; - tmp[tmp_idx] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, tmp[tmp_idx])); + fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2, + fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3, + fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6, + (FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7))); + temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp)); } + tmp[gl_LocalInvocationID.x] = temp; + // sum up partial sums and write back result barrier(); [[unroll]] for (uint s = 16; s > 0; s >>= 1) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index 95c286eeb..0b392d68d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -1,5 +1,7 @@ #version 450 +#extension GL_EXT_shader_explicit_arithmetic_types : require + #include "mul_mat_vec_base.comp" layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; @@ -9,6 +11,10 @@ shared FLOAT_TYPE tmp[32]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; + if (row >= p.stride_d) { + return; + } + uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); @@ -36,41 +42,66 @@ void main() { const uint s_offset = 8*v_im + is; const uint y_offset = 128*v_im + l0; - tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp + FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { const uint y_idx = i * QUANT_K + y_offset; const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); -#if K_QUANTS_PER_ITERATION == 1 - const uint tmp_idx = 16 * ix + tid; - tmp[tmp_idx] = fma(FLOAT_TYPE(data_b[b_offset + y_idx + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x03) << 4)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x03) << 4)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x0c) << 2)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x0c) << 2)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x30) >> 0)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x30) >> 0)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0xc0) >> 2)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32), tmp[tmp_idx])))))))); -#else + FLOAT_TYPE scales[4]; + scales[0] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]); + scales[1] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]); + scales[2] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]); + scales[3] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]); + + uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16); + uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16); + + uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F; + uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F; + uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F; + uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F; + + uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16); + uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4; + uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2; + uint32_t qh4_u32 = (qh_u32 & 0x30303030) << 0; + uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2; + + uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32; + uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32; + uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32; + uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32; + + uvec4 q0 = uvec4(unpack8(q0_u32)); + uvec4 q1 = uvec4(unpack8(q1_u32)); + uvec4 q2 = uvec4(unpack8(q2_u32)); + uvec4 q3 = uvec4(unpack8(q3_u32)); + + B_TYPE_VEC4 by0 = data_b_v4[(b_offset + y_idx) / 4]; + B_TYPE_VEC4 by32 = data_b_v4[(b_offset + y_idx) / 4 + 8]; + B_TYPE_VEC4 by64 = data_b_v4[(b_offset + y_idx) / 4 + 16]; + B_TYPE_VEC4 by96 = data_b_v4[(b_offset + y_idx) / 4 + 24]; + FLOAT_TYPE sum = FLOAT_TYPE(0.0); [[unroll]] for (int l = 0; l < 4; ++l) { - sum = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+ 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 4) & 3) << 4)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32), sum)))); + sum = fma(FLOAT_TYPE(by0[l]) * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32), + fma(FLOAT_TYPE(by32[l]) * scales[1], FLOAT_TYPE(int8_t(q1[l]) - 32), + fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32), + fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum)))); } - tmp[16 * ix + tid] += sum; -#endif + temp += sum * d; } + tmp[gl_LocalInvocationID.x] = temp; + // sum up partial sums and write back result barrier(); [[unroll]] for (uint s = 16; s > 0; s >>= 1) { if (tid < s) { tmp[tid] += tmp[tid + s]; - } + } barrier(); } if (tid == 0) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index 21dce72fc..7a34820bc 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -1,6 +1,8 @@ -#if !defined(DATA_A_F32) && !defined(DATA_A_F16) -#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require -#endif + +#if !defined(GGML_TYPES_COMP) +#define GGML_TYPES_COMP + +#extension GL_EXT_shader_explicit_arithmetic_types : require #if defined(DATA_A_F32) #define QUANT_K 1 @@ -38,8 +40,14 @@ struct block_q4_0 float16_t d; uint8_t qs[16]; }; +struct block_q4_0_packed16 +{ + float16_t d; + uint16_t qs[16/2]; +}; #define A_TYPE block_q4_0 +#define A_TYPE_PACKED16 block_q4_0_packed16 #endif #if defined(DATA_A_Q4_1) @@ -54,7 +62,15 @@ struct block_q4_1 uint8_t qs[16]; }; +struct block_q4_1_packed16 +{ + float16_t d; + float16_t m; + uint16_t qs[16/2]; +}; + #define A_TYPE block_q4_1 +#define A_TYPE_PACKED16 block_q4_1_packed16 #endif #if defined(DATA_A_Q5_0) @@ -70,7 +86,15 @@ struct block_q5_0 uint8_t qs[16]; }; +struct block_q5_0_packed16 +{ + float16_t d; + uint16_t qh[2]; + uint16_t qs[16/2]; +}; + #define A_TYPE block_q5_0 +#define A_TYPE_PACKED16 block_q5_0_packed16 #endif #if defined(DATA_A_Q5_1) @@ -87,7 +111,16 @@ struct block_q5_1 uint8_t qs[16]; }; +struct block_q5_1_packed16 +{ + float16_t d; + float16_t m; + uint qh; + uint16_t qs[16/2]; +}; + #define A_TYPE block_q5_1 +#define A_TYPE_PACKED16 block_q5_1_packed16 #endif #if defined(DATA_A_Q8_0) @@ -100,8 +133,14 @@ struct block_q8_0 float16_t d; int8_t qs[32]; }; +struct block_q8_0_packed16 +{ + float16_t d; + uint16_t qs[32/2]; +}; #define A_TYPE block_q8_0 +#define A_TYPE_PACKED16 block_q8_0_packed16 #endif // K-quants @@ -116,7 +155,23 @@ struct block_q2_K f16vec2 d; }; +struct block_q2_K_packed16 +{ + uint16_t scales[QUANT_K/16/2]; + uint16_t qs[QUANT_K/4/2]; + f16vec2 d; +}; + +struct block_q2_K_packed32 +{ + uint32_t scales[QUANT_K/16/4]; + uint32_t qs[QUANT_K/4/4]; + f16vec2 d; +}; + #define A_TYPE block_q2_K +#define A_TYPE_PACKED16 block_q2_K_packed16 +#define A_TYPE_PACKED32 block_q2_K_packed32 #endif #if defined(DATA_A_Q3_K) @@ -131,7 +186,16 @@ struct block_q3_K float16_t d; }; +struct block_q3_K_packed16 +{ + uint16_t hmask[QUANT_K/8/2]; + uint16_t qs[QUANT_K/4/2]; + uint16_t scales[12/2]; + float16_t d; +}; + #define A_TYPE block_q3_K +#define A_TYPE_PACKED16 block_q3_K_packed16 #endif #if defined(DATA_A_Q4_K) @@ -145,7 +209,23 @@ struct block_q4_K uint8_t qs[QUANT_K/2]; }; +struct block_q4_K_packed16 +{ + f16vec2 d; + uint16_t scales[3*QUANT_K/64/2]; + uint16_t qs[QUANT_K/2/2]; +}; + +struct block_q4_K_packed32 +{ + f16vec2 d; + uint32_t scales[3*QUANT_K/64/4]; + uint32_t qs[QUANT_K/2/4]; +}; + #define A_TYPE block_q4_K +#define A_TYPE_PACKED16 block_q4_K_packed16 +#define A_TYPE_PACKED32 block_q4_K_packed32 #endif #if defined(DATA_A_Q5_K) @@ -160,7 +240,16 @@ struct block_q5_K uint8_t qs[QUANT_K/2]; }; +struct block_q5_K_packed16 +{ + f16vec2 d; + uint16_t scales[12/2]; + uint16_t qh[QUANT_K/8/2]; + uint16_t qs[QUANT_K/2/2]; +}; + #define A_TYPE block_q5_K +#define A_TYPE_PACKED16 block_q5_K_packed16 #endif #if defined(DATA_A_Q6_K) @@ -175,7 +264,16 @@ struct block_q6_K float16_t d; }; +struct block_q6_K_packed16 +{ + uint16_t ql[QUANT_K/2/2]; + uint16_t qh[QUANT_K/4/2]; + int8_t scales[QUANT_K/16]; + float16_t d; +}; + #define A_TYPE block_q6_K +#define A_TYPE_PACKED16 block_q6_K_packed16 #endif // IQuants @@ -191,10 +289,19 @@ struct block_iq4_nl uint8_t qs[QUANT_K/2]; }; +struct block_iq4_nl_packed16 +{ + float16_t d; + uint16_t qs[QUANT_K/2/2]; +}; + #define A_TYPE block_iq4_nl +#define A_TYPE_PACKED16 block_iq4_nl_packed16 const int8_t kvalues_iq4nl[16] = { int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10), int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113) }; #endif + +#endif // !defined(GGML_TYPES_COMP) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index fe3e4cb39..f75310955 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -317,10 +317,10 @@ void process_shaders() { std::string data_a_key = "DATA_A_" + to_uppercase(tname); std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp"; - string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}})); - string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}})); + string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}})); + string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}})); - string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}})); + string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}})); // Dequant shaders if (tname != "f16") { From 8fd4b7fa29c3061b2e02e897d818dfcbc593430a Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 20 Nov 2024 01:40:18 -0600 Subject: [PATCH 03/14] vulkan: copy iq4_nl LUT into shared memory (#10409) --- .../ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp | 2 ++ .../ggml-vulkan/vulkan-shaders/get_rows_quant.comp | 4 ++++ .../src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp | 4 ++++ ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp | 4 ++++ ggml/src/ggml-vulkan/vulkan-shaders/types.comp | 13 ++++++++++++- .../vulkan-shaders/vulkan-shaders-gen.cpp | 6 +++--- 6 files changed, 29 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp index 34ef3da30..8de14fc03 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp @@ -10,6 +10,8 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; void main() { const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; + init_iq4nl_shmem(); + const uint tid = gl_LocalInvocationID.x % 64; const uint il = tid/32; const uint ir = tid%32; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp index 8d30b63c1..7f608315b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp @@ -12,6 +12,10 @@ void main() { const uint i11 = (gl_GlobalInvocationID.z)/p.ne12; const uint i12 = (gl_GlobalInvocationID.z)%p.ne12; +#if defined(DATA_A_IQ4_NL) + init_iq4nl_shmem(); +#endif + if (i00 >= p.ne00) { return; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp index 00807a060..2d5b8e466 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp @@ -161,6 +161,10 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { void main() { const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z); +#if defined(DATA_A_IQ4_NL) + init_iq4nl_shmem(); +#endif + // do NUM_ROWS at a time, unless there aren't enough remaining rows if (first_row + NUM_ROWS <= p.stride_d) { compute_outputs(first_row, NUM_ROWS); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index fffdd1818..2ff5c4305 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -75,6 +75,10 @@ shared u16vec2 row_ids[3072]; #endif void main() { +#if defined(DATA_A_IQ4_NL) + init_iq4nl_shmem(); +#endif + #ifdef MUL_MAT_ID const uint expert_idx = gl_GlobalInvocationID.z; #else diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index 7a34820bc..bc28e0ab8 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -298,10 +298,21 @@ struct block_iq4_nl_packed16 #define A_TYPE block_iq4_nl #define A_TYPE_PACKED16 block_iq4_nl_packed16 -const int8_t kvalues_iq4nl[16] = { +const int8_t kvalues_iq4nl_const[16] = { int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10), int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113) }; + +shared FLOAT_TYPE kvalues_iq4nl[16]; + +void init_iq4nl_shmem() +{ + // copy the table into shared memory and sync + if (gl_LocalInvocationIndex.x < 16) { + kvalues_iq4nl[gl_LocalInvocationIndex.x] = FLOAT_TYPE(kvalues_iq4nl_const[gl_LocalInvocationIndex.x]); + } + barrier(); +} #endif #endif // !defined(GGML_TYPES_COMP) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index f75310955..6bbe8e96e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -331,11 +331,11 @@ void process_shaders() { shader = (tname == "f32" || tname == "f16") ? "get_rows.comp" : "get_rows_quant.comp"; if (tname == "f16") { - string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}); + string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}})); } else { - string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}); + string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}})); } - string_to_spv("get_rows_" + tname + "_f32", shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}); + string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}})); } } From fab5d30ff6729ff6ff615c41e8c0215d6bc30393 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Wed, 20 Nov 2024 12:57:53 +0100 Subject: [PATCH 04/14] llama : add .clang-format file (#10415) --- .clang-format | 161 ++++++ examples/llama-bench/llama-bench.cpp | 700 +++++++++++++++------------ 2 files changed, 550 insertions(+), 311 deletions(-) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000..45232b80e --- /dev/null +++ b/.clang-format @@ -0,0 +1,161 @@ +--- +Language: Cpp +AlignAfterOpenBracket: Align +AlignArrayOfStructures: Left +AlignConsecutiveAssignments: AcrossComments +AlignConsecutiveBitFields: AcrossComments +AlignConsecutiveDeclarations: AcrossComments +AlignConsecutiveMacros: AcrossComments +# AlignConsecutiveShortCaseStatements: AcrossComments +AlignEscapedNewlines: Left # LeftWithLastLine +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 1 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: false +# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Inline +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: Inline +AllowShortLoopsOnASingleLine: false +AlwaysBreakBeforeMultilineStrings: true +BinPackArguments: true +BinPackParameters: true # OnePerLine +BitFieldColonSpacing: Both +BreakBeforeBraces: Custom # Attach +BraceWrapping: + AfterCaseLabel: true + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false +# BreakAdjacentStringLiterals: true +BreakAfterAttributes: Never +BreakBeforeBinaryOperators: None +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: false +# BreakBinaryOperations: Never +BreakConstructorInitializers: AfterColon +# BreakFunctionDefinitionParameters: false +BreakInheritanceList: AfterComma +BreakStringLiterals: true +# BreakTemplateDeclarations: Yes +ColumnLimit: 120 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +EmptyLineBeforeAccessModifier: Leave +EmptyLineAfterAccessModifier: Never +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^<.*\.h>' + Priority: 1 + SortPriority: 0 + - Regex: '^<.*' + Priority: 2 + SortPriority: 0 + - Regex: '.*' + Priority: 3 + SortPriority: 0 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseBlocks: true +IndentCaseLabels: true +IndentExternBlock: NoIndent +IndentGotoLabels: false +IndentPPDirectives: AfterHash +IndentWidth: 4 +IndentWrappedFunctionNames: false +InsertBraces: true # NOTE: may lead to incorrect formatting +InsertNewlineAtEOF: true +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +LambdaBodyIndentation: Signature +LineEnding: LF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 4 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true +PPIndentWidth: -1 +PackConstructorInitializers: CurrentLine +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Middle +QualifierAlignment: Left +#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict'] +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' +ReferenceAlignment: Middle +ReflowComments: false # IndentOnly +SeparateDefinitionBlocks: Always +SortIncludes: CaseInsensitive +SortUsingDeclarations: LexicographicNumeric +SpaceAfterCStyleCast: true +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: Never +SpacesInContainerLiterals: true +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +Standard: c++17 +TabWidth: 4 +UseTab: Never +WhitespaceSensitiveMacros: ['STRINGIZE'] +... + diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 8f4e0e206..3dc84a75c 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -6,28 +6,28 @@ #include #include #include +#include #include #include -#include #include #include #include #include #include #include -#include #include +#include +#include "common.h" #include "ggml.h" #include "llama.h" -#include "common.h" #ifdef _WIN32 -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -# define NOMINMAX -#endif -#include +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include #endif // utils @@ -36,8 +36,7 @@ static uint64_t get_time_ns() { return std::chrono::nanoseconds(clock::now().time_since_epoch()).count(); } -template -static std::string join(const std::vector & values, const std::string & delim) { +template static std::string join(const std::vector & values, const std::string & delim) { std::ostringstream str; for (size_t i = 0; i < values.size(); i++) { str << values[i]; @@ -48,38 +47,35 @@ static std::string join(const std::vector & values, const std::string & delim return str.str(); } -template -static std::vector transform_to_str(const std::vector & values, F f) { +template static std::vector transform_to_str(const std::vector & values, F f) { std::vector str_values; std::transform(values.begin(), values.end(), std::back_inserter(str_values), f); return str_values; } -template -static T avg(const std::vector & v) { +template static T avg(const std::vector & v) { if (v.empty()) { return 0; } T sum = std::accumulate(v.begin(), v.end(), T(0)); - return sum / (T)v.size(); + return sum / (T) v.size(); } -template -static T stdev(const std::vector & v) { +template static T stdev(const std::vector & v) { if (v.size() <= 1) { return 0; } - T mean = avg(v); + T mean = avg(v); T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0)); - T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1)); + T stdev = std::sqrt(sq_sum / (T) (v.size() - 1) - mean * mean * (T) v.size() / (T) (v.size() - 1)); return stdev; } static std::string get_cpu_info() { std::vector cpu_list; for (size_t i = 0; i < ggml_backend_dev_count(); i++) { - auto * dev = ggml_backend_dev_get(i); - auto dev_type = ggml_backend_dev_type(dev); + auto * dev = ggml_backend_dev_get(i); + auto dev_type = ggml_backend_dev_type(dev); if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) { cpu_list.push_back(ggml_backend_dev_description(dev)); } @@ -90,8 +86,8 @@ static std::string get_cpu_info() { static std::string get_gpu_info() { std::vector gpu_list; for (size_t i = 0; i < ggml_backend_dev_count(); i++) { - auto * dev = ggml_backend_dev_get(i); - auto dev_type = ggml_backend_dev_type(dev); + auto * dev = ggml_backend_dev_get(i); + auto dev_type = ggml_backend_dev_type(dev); if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) { gpu_list.push_back(ggml_backend_dev_description(dev)); } @@ -100,17 +96,24 @@ static std::string get_gpu_info() { } // command line params -enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL}; +enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL }; static const char * output_format_str(output_formats format) { switch (format) { - case NONE: return "none"; - case CSV: return "csv"; - case JSON: return "json"; - case JSONL: return "jsonl"; - case MARKDOWN: return "md"; - case SQL: return "sql"; - default: GGML_ABORT("invalid output format"); + case NONE: + return "none"; + case CSV: + return "csv"; + case JSON: + return "json"; + case JSONL: + return "jsonl"; + case MARKDOWN: + return "md"; + case SQL: + return "sql"; + default: + GGML_ABORT("invalid output format"); } } @@ -135,10 +138,14 @@ static bool output_format_from_str(const std::string & s, output_formats & forma static const char * split_mode_str(llama_split_mode mode) { switch (mode) { - case LLAMA_SPLIT_MODE_NONE: return "none"; - case LLAMA_SPLIT_MODE_LAYER: return "layer"; - case LLAMA_SPLIT_MODE_ROW: return "row"; - default: GGML_ABORT("invalid split mode"); + case LLAMA_SPLIT_MODE_NONE: + return "none"; + case LLAMA_SPLIT_MODE_LAYER: + return "layer"; + case LLAMA_SPLIT_MODE_ROW: + return "row"; + default: + GGML_ABORT("invalid split mode"); } } @@ -149,59 +156,59 @@ static std::string pair_str(const std::pair & p) { } struct cmd_params { - std::vector model; - std::vector n_prompt; - std::vector n_gen; + std::vector model; + std::vector n_prompt; + std::vector n_gen; std::vector> n_pg; - std::vector n_batch; - std::vector n_ubatch; - std::vector type_k; - std::vector type_v; - std::vector n_threads; - std::vector cpu_mask; - std::vector cpu_strict; - std::vector poll; - std::vector n_gpu_layers; - std::vector rpc_servers; - std::vector split_mode; - std::vector main_gpu; - std::vector no_kv_offload; - std::vector flash_attn; - std::vector> tensor_split; - std::vector use_mmap; - std::vector embeddings; - ggml_numa_strategy numa; - int reps; - ggml_sched_priority prio; - int delay; - bool verbose; - bool progress; - output_formats output_format; - output_formats output_format_stderr; + std::vector n_batch; + std::vector n_ubatch; + std::vector type_k; + std::vector type_v; + std::vector n_threads; + std::vector cpu_mask; + std::vector cpu_strict; + std::vector poll; + std::vector n_gpu_layers; + std::vector rpc_servers; + std::vector split_mode; + std::vector main_gpu; + std::vector no_kv_offload; + std::vector flash_attn; + std::vector> tensor_split; + std::vector use_mmap; + std::vector embeddings; + ggml_numa_strategy numa; + int reps; + ggml_sched_priority prio; + int delay; + bool verbose; + bool progress; + output_formats output_format; + output_formats output_format_stderr; }; static const cmd_params cmd_params_defaults = { - /* model */ {"models/7B/ggml-model-q4_0.gguf"}, - /* n_prompt */ {512}, - /* n_gen */ {128}, + /* model */ { "models/7B/ggml-model-q4_0.gguf" }, + /* n_prompt */ { 512 }, + /* n_gen */ { 128 }, /* n_pg */ {}, - /* n_batch */ {2048}, - /* n_ubatch */ {512}, - /* type_k */ {GGML_TYPE_F16}, - /* type_v */ {GGML_TYPE_F16}, - /* n_threads */ {cpu_get_num_math()}, - /* cpu_mask */ {"0x0"}, - /* cpu_strict */ {false}, - /* poll */ {50}, - /* n_gpu_layers */ {99}, - /* rpc_servers */ {""}, - /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, - /* main_gpu */ {0}, - /* no_kv_offload */ {false}, - /* flash_attn */ {false}, - /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, - /* use_mmap */ {true}, - /* embeddings */ {false}, + /* n_batch */ { 2048 }, + /* n_ubatch */ { 512 }, + /* type_k */ { GGML_TYPE_F16 }, + /* type_v */ { GGML_TYPE_F16 }, + /* n_threads */ { cpu_get_num_math() }, + /* cpu_mask */ { "0x0" }, + /* cpu_strict */ { false }, + /* poll */ { 50 }, + /* n_gpu_layers */ { 99 }, + /* rpc_servers */ { "" }, + /* split_mode */ { LLAMA_SPLIT_MODE_LAYER }, + /* main_gpu */ { 0 }, + /* no_kv_offload */ { false }, + /* flash_attn */ { false }, + /* tensor_split */ { std::vector(llama_max_devices(), 0.0f) }, + /* use_mmap */ { true }, + /* embeddings */ { false }, /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, /* prio */ GGML_SCHED_PRIO_NORMAL, @@ -218,38 +225,59 @@ static void print_usage(int /* argc */, char ** argv) { printf("options:\n"); printf(" -h, --help\n"); printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); - printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); + printf(" -p, --n-prompt (default: %s)\n", + join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); - printf(" -pg (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); - printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); - printf(" -ub, --ubatch-size (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str()); - printf(" -ctk, --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); - printf(" -ctv, --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); - printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); - printf(" -C, --cpu-mask (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str()); - printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str()); + printf(" -pg (default: %s)\n", + join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); + printf(" -b, --batch-size (default: %s)\n", + join(cmd_params_defaults.n_batch, ",").c_str()); + printf(" -ub, --ubatch-size (default: %s)\n", + join(cmd_params_defaults.n_ubatch, ",").c_str()); + printf(" -ctk, --cache-type-k (default: %s)\n", + join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); + printf(" -ctv, --cache-type-v (default: %s)\n", + join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); + printf(" -t, --threads (default: %s)\n", + join(cmd_params_defaults.n_threads, ",").c_str()); + printf(" -C, --cpu-mask (default: %s)\n", + join(cmd_params_defaults.cpu_mask, ",").c_str()); + printf(" --cpu-strict <0|1> (default: %s)\n", + join(cmd_params_defaults.cpu_strict, ",").c_str()); printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); - printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -ngl, --n-gpu-layers (default: %s)\n", + join(cmd_params_defaults.n_gpu_layers, ",").c_str()); if (llama_supports_rpc()) { - printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); + printf(" -rpc, --rpc (default: %s)\n", + join(cmd_params_defaults.rpc_servers, ",").c_str()); } - printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); - printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); - printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); - printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); - printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); + printf(" -sm, --split-mode (default: %s)\n", + join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); + printf(" -mg, --main-gpu (default: %s)\n", + join(cmd_params_defaults.main_gpu, ",").c_str()); + printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", + join(cmd_params_defaults.no_kv_offload, ",").c_str()); + printf(" -fa, --flash-attn <0|1> (default: %s)\n", + join(cmd_params_defaults.flash_attn, ",").c_str()); + printf(" -mmp, --mmap <0|1> (default: %s)\n", + join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" --numa (default: disabled)\n"); - printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); + printf(" -embd, --embeddings <0|1> (default: %s)\n", + join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -ts, --tensor-split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio); printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay); - printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); - printf(" -oe, --output-err (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); + printf(" -o, --output (default: %s)\n", + output_format_str(cmd_params_defaults.output_format)); + printf(" -oe, --output-err (default: %s)\n", + output_format_str(cmd_params_defaults.output_format_stderr)); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0"); printf("\n"); - printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); + printf( + "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter " + "multiple times.\n"); } static ggml_type ggml_type_from_name(const std::string & s) { @@ -281,22 +309,21 @@ static ggml_type ggml_type_from_name(const std::string & s) { return GGML_TYPE_COUNT; } - static cmd_params parse_cmd_params(int argc, char ** argv) { - cmd_params params; - std::string arg; - bool invalid_param = false; - const std::string arg_prefix = "--"; - const char split_delim = ','; + cmd_params params; + std::string arg; + bool invalid_param = false; + const std::string arg_prefix = "--"; + const char split_delim = ','; - params.verbose = cmd_params_defaults.verbose; - params.output_format = cmd_params_defaults.output_format; + params.verbose = cmd_params_defaults.verbose; + params.output_format = cmd_params_defaults.output_format; params.output_format_stderr = cmd_params_defaults.output_format_stderr; - params.reps = cmd_params_defaults.reps; - params.numa = cmd_params_defaults.numa; - params.prio = cmd_params_defaults.prio; - params.delay = cmd_params_defaults.delay; - params.progress = cmd_params_defaults.progress; + params.reps = cmd_params_defaults.reps; + params.numa = cmd_params_defaults.numa; + params.prio = cmd_params_defaults.prio; + params.delay = cmd_params_defaults.delay; + params.progress = cmd_params_defaults.progress; for (int i = 1; i < argc; i++) { arg = argv[i]; @@ -338,7 +365,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])}); + params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) }); } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) { invalid_param = true; @@ -358,7 +385,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector types; for (const auto & t : p) { ggml_type gt = ggml_type_from_name(t); @@ -377,7 +404,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector types; for (const auto & t : p) { ggml_type gt = ggml_type_from_name(t); @@ -437,7 +464,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector modes; for (const auto & m : p) { llama_split_mode mode; @@ -476,10 +503,16 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } else { std::string value(argv[i]); - /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { invalid_param = true; break; } + /**/ if (value == "distribute" || value == "") { + params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; + } else if (value == "isolate") { + params.numa = GGML_NUMA_STRATEGY_ISOLATE; + } else if (value == "numactl") { + params.numa = GGML_NUMA_STRATEGY_NUMACTL; + } else { + invalid_param = true; + break; + } } } else if (arg == "-fa" || arg == "--flash-attn") { if (++i >= argc) { @@ -509,9 +542,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } for (auto ts : string_split(argv[i], split_delim)) { // split string by ; and / - const std::regex regex{R"([;/]+)"}; - std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1}; - std::vector split_arg{it, {}}; + const std::regex regex{ R"([;/]+)" }; + std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 }; + std::vector split_arg{ it, {} }; GGML_ASSERT(split_arg.size() <= llama_max_devices()); std::vector tensor_split(llama_max_devices()); @@ -570,52 +603,94 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } // set defaults - if (params.model.empty()) { params.model = cmd_params_defaults.model; } - if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; } - if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; } - if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; } - if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } - if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; } - if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; } - if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } - if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } - if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; } - if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } - if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } - if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } - if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; } - if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } - if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } - if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } - if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } - if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; } - if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; } - if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; } + if (params.model.empty()) { + params.model = cmd_params_defaults.model; + } + if (params.n_prompt.empty()) { + params.n_prompt = cmd_params_defaults.n_prompt; + } + if (params.n_gen.empty()) { + params.n_gen = cmd_params_defaults.n_gen; + } + if (params.n_pg.empty()) { + params.n_pg = cmd_params_defaults.n_pg; + } + if (params.n_batch.empty()) { + params.n_batch = cmd_params_defaults.n_batch; + } + if (params.n_ubatch.empty()) { + params.n_ubatch = cmd_params_defaults.n_ubatch; + } + if (params.type_k.empty()) { + params.type_k = cmd_params_defaults.type_k; + } + if (params.type_v.empty()) { + params.type_v = cmd_params_defaults.type_v; + } + if (params.n_gpu_layers.empty()) { + params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; + } + if (params.rpc_servers.empty()) { + params.rpc_servers = cmd_params_defaults.rpc_servers; + } + if (params.split_mode.empty()) { + params.split_mode = cmd_params_defaults.split_mode; + } + if (params.main_gpu.empty()) { + params.main_gpu = cmd_params_defaults.main_gpu; + } + if (params.no_kv_offload.empty()) { + params.no_kv_offload = cmd_params_defaults.no_kv_offload; + } + if (params.flash_attn.empty()) { + params.flash_attn = cmd_params_defaults.flash_attn; + } + if (params.tensor_split.empty()) { + params.tensor_split = cmd_params_defaults.tensor_split; + } + if (params.use_mmap.empty()) { + params.use_mmap = cmd_params_defaults.use_mmap; + } + if (params.embeddings.empty()) { + params.embeddings = cmd_params_defaults.embeddings; + } + if (params.n_threads.empty()) { + params.n_threads = cmd_params_defaults.n_threads; + } + if (params.cpu_mask.empty()) { + params.cpu_mask = cmd_params_defaults.cpu_mask; + } + if (params.cpu_strict.empty()) { + params.cpu_strict = cmd_params_defaults.cpu_strict; + } + if (params.poll.empty()) { + params.poll = cmd_params_defaults.poll; + } return params; } struct cmd_params_instance { - std::string model; - int n_prompt; - int n_gen; - int n_batch; - int n_ubatch; - ggml_type type_k; - ggml_type type_v; - int n_threads; - std::string cpu_mask; - bool cpu_strict; - int poll; - int n_gpu_layers; - std::string rpc_servers; - llama_split_mode split_mode; - int main_gpu; - bool no_kv_offload; - bool flash_attn; + std::string model; + int n_prompt; + int n_gen; + int n_batch; + int n_ubatch; + ggml_type type_k; + ggml_type type_v; + int n_threads; + std::string cpu_mask; + bool cpu_strict; + int poll; + int n_gpu_layers; + std::string rpc_servers; + llama_split_mode split_mode; + int main_gpu; + bool no_kv_offload; + bool flash_attn; std::vector tensor_split; - bool use_mmap; - bool embeddings; + bool use_mmap; + bool embeddings; llama_model_params to_llama_mparams() const { llama_model_params mparams = llama_model_default_params(); @@ -624,35 +699,31 @@ struct cmd_params_instance { if (!rpc_servers.empty()) { mparams.rpc_servers = rpc_servers.c_str(); } - mparams.split_mode = split_mode; - mparams.main_gpu = main_gpu; + mparams.split_mode = split_mode; + mparams.main_gpu = main_gpu; mparams.tensor_split = tensor_split.data(); - mparams.use_mmap = use_mmap; + mparams.use_mmap = use_mmap; return mparams; } bool equal_mparams(const cmd_params_instance & other) const { - return model == other.model && - n_gpu_layers == other.n_gpu_layers && - rpc_servers == other.rpc_servers && - split_mode == other.split_mode && - main_gpu == other.main_gpu && - use_mmap == other.use_mmap && + return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers == other.rpc_servers && + split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split; } llama_context_params to_llama_cparams() const { llama_context_params cparams = llama_context_default_params(); - cparams.n_ctx = n_prompt + n_gen; - cparams.n_batch = n_batch; - cparams.n_ubatch = n_ubatch; - cparams.type_k = type_k; - cparams.type_v = type_v; + cparams.n_ctx = n_prompt + n_gen; + cparams.n_batch = n_batch; + cparams.n_ubatch = n_ubatch; + cparams.type_k = type_k; + cparams.type_v = type_v; cparams.offload_kqv = !no_kv_offload; - cparams.flash_attn = flash_attn; - cparams.embeddings = embeddings; + cparams.flash_attn = flash_attn; + cparams.embeddings = embeddings; return cparams; } @@ -662,6 +733,7 @@ static std::vector get_cmd_params_instances(const cmd_param std::vector instances; // this ordering minimizes the number of times that each model needs to be reloaded + // clang-format off for (const auto & m : params.model) for (const auto & nl : params.n_gpu_layers) for (const auto & rpc : params.rpc_servers) @@ -767,100 +839,94 @@ static std::vector get_cmd_params_instances(const cmd_param instances.push_back(instance); } } + // clang-format on return instances; } struct test { static const std::string build_commit; - static const int build_number; + static const int build_number; static const std::string cpu_info; static const std::string gpu_info; - std::string model_filename; - std::string model_type; - uint64_t model_size; - uint64_t model_n_params; - int n_batch; - int n_ubatch; - int n_threads; - std::string cpu_mask; - bool cpu_strict; - int poll; - ggml_type type_k; - ggml_type type_v; - int n_gpu_layers; - llama_split_mode split_mode; - int main_gpu; - bool no_kv_offload; - bool flash_attn; - std::vector tensor_split; - bool use_mmap; - bool embeddings; - int n_prompt; - int n_gen; - std::string test_time; - std::vector samples_ns; + std::string model_filename; + std::string model_type; + uint64_t model_size; + uint64_t model_n_params; + int n_batch; + int n_ubatch; + int n_threads; + std::string cpu_mask; + bool cpu_strict; + int poll; + ggml_type type_k; + ggml_type type_v; + int n_gpu_layers; + llama_split_mode split_mode; + int main_gpu; + bool no_kv_offload; + bool flash_attn; + std::vector tensor_split; + bool use_mmap; + bool embeddings; + int n_prompt; + int n_gen; + std::string test_time; + std::vector samples_ns; test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) { model_filename = inst.model; char buf[128]; llama_model_desc(lmodel, buf, sizeof(buf)); - model_type = buf; - model_size = llama_model_size(lmodel); + model_type = buf; + model_size = llama_model_size(lmodel); model_n_params = llama_model_n_params(lmodel); - n_batch = inst.n_batch; - n_ubatch = inst.n_ubatch; - n_threads = inst.n_threads; - cpu_mask = inst.cpu_mask; - cpu_strict = inst.cpu_strict; - poll = inst.poll; - type_k = inst.type_k; - type_v = inst.type_v; - n_gpu_layers = inst.n_gpu_layers; - split_mode = inst.split_mode; - main_gpu = inst.main_gpu; - no_kv_offload = inst.no_kv_offload; - flash_attn = inst.flash_attn; - tensor_split = inst.tensor_split; - use_mmap = inst.use_mmap; - embeddings = inst.embeddings; - n_prompt = inst.n_prompt; - n_gen = inst.n_gen; + n_batch = inst.n_batch; + n_ubatch = inst.n_ubatch; + n_threads = inst.n_threads; + cpu_mask = inst.cpu_mask; + cpu_strict = inst.cpu_strict; + poll = inst.poll; + type_k = inst.type_k; + type_v = inst.type_v; + n_gpu_layers = inst.n_gpu_layers; + split_mode = inst.split_mode; + main_gpu = inst.main_gpu; + no_kv_offload = inst.no_kv_offload; + flash_attn = inst.flash_attn; + tensor_split = inst.tensor_split; + use_mmap = inst.use_mmap; + embeddings = inst.embeddings; + n_prompt = inst.n_prompt; + n_gen = inst.n_gen; // RFC 3339 date-time format - time_t t = time(NULL); + time_t t = time(NULL); std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); test_time = buf; (void) ctx; } - uint64_t avg_ns() const { - return ::avg(samples_ns); - } + uint64_t avg_ns() const { return ::avg(samples_ns); } - uint64_t stdev_ns() const { - return ::stdev(samples_ns); - } + uint64_t stdev_ns() const { return ::stdev(samples_ns); } std::vector get_ts() const { - int n_tokens = n_prompt + n_gen; + int n_tokens = n_prompt + n_gen; std::vector ts; - std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; }); + std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), + [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; }); return ts; } - double avg_ts() const { - return ::avg(get_ts()); - } + double avg_ts() const { return ::avg(get_ts()); } - double stdev_ts() const { - return ::stdev(get_ts()); - } + double stdev_ts() const { return ::stdev(get_ts()); } static std::string get_backend() { std::vector backends; for (size_t i = 0; i < ggml_backend_reg_count(); i++) { - auto * reg = ggml_backend_reg_get(i); + auto * reg = ggml_backend_reg_get(i); std::string name = ggml_backend_reg_name(reg); if (name != "CPU") { backends.push_back(ggml_backend_reg_name(reg)); @@ -871,36 +937,27 @@ struct test { static const std::vector & get_fields() { static const std::vector fields = { - "build_commit", "build_number", - "cpu_info", "gpu_info", "backends", - "model_filename", "model_type", "model_size", "model_n_params", - "n_batch", "n_ubatch", - "n_threads", "cpu_mask", "cpu_strict", "poll", - "type_k", "type_v", - "n_gpu_layers", "split_mode", - "main_gpu", "no_kv_offload", "flash_attn", - "tensor_split", "use_mmap", "embeddings", - "n_prompt", "n_gen", "test_time", - "avg_ns", "stddev_ns", - "avg_ts", "stddev_ts", + "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", + "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", + "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", + "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", + "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", + "avg_ts", "stddev_ts", }; return fields; } - enum field_type {STRING, BOOL, INT, FLOAT}; + enum field_type { STRING, BOOL, INT, FLOAT }; static field_type get_field_type(const std::string & field) { - if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || - field == "n_threads" || field == "poll" || - field == "model_size" || field == "model_n_params" || - field == "n_gpu_layers" || field == "main_gpu" || - field == "n_prompt" || field == "n_gen" || - field == "avg_ns" || field == "stddev_ns") { + if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || + field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" || + field == "stddev_ns") { return INT; } - if (field == "f16_kv" || field == "no_kv_offload" || - field == "cpu_strict" || - field == "flash_attn" || field == "use_mmap" || field == "embeddings") { + if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || + field == "use_mmap" || field == "embeddings") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -911,7 +968,7 @@ struct test { std::vector get_values() const { std::string tensor_split_str; - int max_nonzero = 0; + int max_nonzero = 0; for (size_t i = 0; i < llama_max_devices(); i++) { if (tensor_split[i] > 0) { max_nonzero = i; @@ -925,29 +982,47 @@ struct test { tensor_split_str += "/"; } } - std::vector values = { - build_commit, std::to_string(build_number), - cpu_info, gpu_info, get_backend(), - model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), - std::to_string(n_batch), std::to_string(n_ubatch), - std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll), - ggml_type_name(type_k), ggml_type_name(type_v), - std::to_string(n_gpu_layers), split_mode_str(split_mode), - std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn), - tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), - std::to_string(n_prompt), std::to_string(n_gen), test_time, - std::to_string(avg_ns()), std::to_string(stdev_ns()), - std::to_string(avg_ts()), std::to_string(stdev_ts()) - }; + std::vector values = { build_commit, + std::to_string(build_number), + cpu_info, + gpu_info, + get_backend(), + model_filename, + model_type, + std::to_string(model_size), + std::to_string(model_n_params), + std::to_string(n_batch), + std::to_string(n_ubatch), + std::to_string(n_threads), + cpu_mask, + std::to_string(cpu_strict), + std::to_string(poll), + ggml_type_name(type_k), + ggml_type_name(type_v), + std::to_string(n_gpu_layers), + split_mode_str(split_mode), + std::to_string(main_gpu), + std::to_string(no_kv_offload), + std::to_string(flash_attn), + tensor_split_str, + std::to_string(use_mmap), + std::to_string(embeddings), + std::to_string(n_prompt), + std::to_string(n_gen), + test_time, + std::to_string(avg_ns()), + std::to_string(stdev_ns()), + std::to_string(avg_ts()), + std::to_string(stdev_ts()) }; return values; } std::map get_map() const { std::map map; - auto fields = get_fields(); - auto values = get_values(); - std::transform(fields.begin(), fields.end(), values.begin(), - std::inserter(map, map.end()), std::make_pair); + auto fields = get_fields(); + auto values = get_values(); + std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()), + std::make_pair); return map; } }; @@ -961,9 +1036,12 @@ struct printer { virtual ~printer() {} FILE * fout; + virtual void print_header(const cmd_params & params) { (void) params; } + virtual void print_test(const test & t) = 0; - virtual void print_footer() { } + + virtual void print_footer() {} }; struct csv_printer : public printer { @@ -979,7 +1057,7 @@ struct csv_printer : public printer { return escaped; } - void print_header(const cmd_params & params) override { + void print_header(const cmd_params & params) override { std::vector fields = test::get_fields(); fprintf(fout, "%s\n", join(fields, ",").c_str()); (void) params; @@ -992,7 +1070,6 @@ struct csv_printer : public printer { } }; - static std::string escape_json(const std::string & value) { std::string escaped; for (auto c : value) { @@ -1000,7 +1077,7 @@ static std::string escape_json(const std::string & value) { escaped += "\\\""; } else if (c == '\\') { escaped += "\\\\"; - } else if (c <= 0x1f) { + } else if (c <= 0x1f) { char buf[8]; snprintf(buf, sizeof(buf), "\\u%04x", c); escaped += buf; @@ -1033,7 +1110,8 @@ struct json_printer : public printer { void print_fields(const std::vector & fields, const std::vector & values) { assert(fields.size() == values.size()); for (size_t i = 0; i < fields.size(); i++) { - fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str()); + fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), + format_json_value(fields.at(i), values.at(i)).c_str()); } } @@ -1051,12 +1129,9 @@ struct json_printer : public printer { fflush(fout); } - void print_footer() override { - fprintf(fout, "\n]\n"); - } + void print_footer() override { fprintf(fout, "\n]\n"); } }; - struct jsonl_printer : public printer { void print_fields(const std::vector & fields, const std::vector & values) { assert(fields.size() == values.size()); @@ -1116,7 +1191,7 @@ struct markdown_printer : public printer { return 13; } - int width = std::max((int)field.length(), 10); + int width = std::max((int) field.length(), 10); if (test::get_field_type(field) == test::STRING) { return -width; @@ -1230,18 +1305,18 @@ struct markdown_printer : public printer { fprintf(fout, "|"); for (const auto & field : fields) { std::string value; - char buf[128]; + char buf[128]; if (field == "model") { value = t.model_type; } else if (field == "size") { - if (t.model_size < 1024*1024*1024) { + if (t.model_size < 1024 * 1024 * 1024) { snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0); } else { snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0); } value = buf; } else if (field == "params") { - if (t.model_n_params < 1000*1000*1000) { + if (t.model_n_params < 1000 * 1000 * 1000) { snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6); } else { snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9); @@ -1303,7 +1378,8 @@ struct sql_printer : public printer { std::vector fields = test::get_fields(); fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n"); for (size_t i = 0; i < fields.size(); i++) { - fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), i < fields.size() - 1 ? "," : ""); + fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), + i < fields.size() - 1 ? "," : ""); } fprintf(fout, ");\n"); fprintf(fout, "\n"); @@ -1324,8 +1400,8 @@ struct sql_printer : public printer { static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - const llama_model * model = llama_get_model(ctx); - const int32_t n_vocab = llama_n_vocab(model); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); std::vector tokens(n_batch); @@ -1333,7 +1409,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th while (n_processed < n_prompt) { int n_tokens = std::min(n_prompt - n_processed, n_batch); - tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; + tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; for (int i = 1; i < n_tokens; i++) { tokens[i] = std::rand() % n_vocab; } @@ -1347,8 +1423,8 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th static void test_gen(llama_context * ctx, int n_gen, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - const llama_model * model = llama_get_model(ctx); - const int32_t n_vocab = llama_n_vocab(model); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; @@ -1411,7 +1487,7 @@ int main(int argc, char ** argv) { set_process_priority(params.prio); // initialize printer - std::unique_ptr p = create_printer(params.output_format); + std::unique_ptr p = create_printer(params.output_format); std::unique_ptr p_err = create_printer(params.output_format_stderr); if (p) { @@ -1426,13 +1502,13 @@ int main(int argc, char ** argv) { std::vector params_instances = get_cmd_params_instances(params); - llama_model * lmodel = nullptr; + llama_model * lmodel = nullptr; const cmd_params_instance * prev_inst = nullptr; - int params_idx = 0; + int params_idx = 0; auto params_count = params_instances.size(); for (const auto & inst : params_instances) { - params_idx ++; + params_idx++; if (params.progress) { fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count); } @@ -1475,7 +1551,7 @@ int main(int argc, char ** argv) { tpp.poll = t.poll; tpp.prio = params.prio; - struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp); + struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); if (!threadpool) { fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); exit(1); @@ -1505,13 +1581,15 @@ int main(int argc, char ** argv) { if (t.n_prompt > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps); + fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, + i + 1, params.reps); } test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } if (t.n_gen > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps); + fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, + i + 1, params.reps); } test_gen(ctx, t.n_gen, t.n_threads); } From f95caa79546271722ada703da20ffb1cfcd21fed Mon Sep 17 00:00:00 2001 From: bandoti <141645996+bandoti@users.noreply.github.com> Date: Wed, 20 Nov 2024 12:22:19 -0400 Subject: [PATCH 05/14] cmake: add link dependencies to cmake find pkg (#10433) * cmake pkg: find accelerate, openmp, memkind libs * cmake pkg: find BLAS libs * try BLAS_LIBRARIES instead * Add BLAS link opts * Add more link deps. and set GGML_ vars --- cmake/llama-config.cmake.in | 136 +++++++++++++++++++++++++++--------- 1 file changed, 104 insertions(+), 32 deletions(-) diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in index 28a8c18b6..5c55bc6b8 100644 --- a/cmake/llama-config.cmake.in +++ b/cmake/llama-config.cmake.in @@ -3,12 +3,60 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@) set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) +set(GGML_STATIC @GGML_STATIC@) +set(GGML_NATIVE @GGML_NATIVE@) +set(GGML_LTO @GGML_LTO@) +set(GGML_CCACHE @GGML_CCACHE@) +set(GGML_AVX @GGML_AVX@) +set(GGML_AVX2 @GGML_AVX2@) +set(GGML_AVX512 @GGML_AVX512@) +set(GGML_AVX512_VBMI @GGML_AVX512_VBMI@) +set(GGML_AVX512_VNNI @GGML_AVX512_VNNI@) +set(GGML_AVX512_BF16 @GGML_AVX512_BF16@) +set(GGML_AMX_TILE @GGML_AMX_TILE@) +set(GGML_AMX_INT8 @GGML_AMX_INT8@) +set(GGML_AMX_BF16 @GGML_AMX_BF16@) +set(GGML_FMA @GGML_FMA@) +set(GGML_LASX @GGML_LASX@) +set(GGML_LSX @GGML_LSX@) +set(GGML_RVV @GGML_RVV@) +set(GGML_SVE @GGML_SVE@) + set(GGML_ACCELERATE @GGML_ACCELERATE@) +set(GGML_OPENMP @GGML_OPENMP@) +set(GGML_CPU_HBM @GGML_CPU_HBM@) +set(GGML_BLAS_VENDOR @GGML_BLAS_VENDOR@) + +set(GGML_CUDA_FORCE_MMQ @GGML_CUDA_FORCE_MMQ@) +set(GGML_CUDA_FORCE_CUBLAS @GGML_CUDA_FORCE_CUBLAS@) +set(GGML_CUDA_F16 @GGML_CUDA_F16@) +set(GGML_CUDA_PEER_MAX_BATCH_SIZE @GGML_CUDA_PEER_MAX_BATCH_SIZE@) +set(GGML_CUDA_NO_PEER_COPY @GGML_CUDA_NO_PEER_COPY@) +set(GGML_CUDA_NO_VMM @GGML_CUDA_NO_VMM@) +set(GGML_CUDA_FA_ALL_QUANTS @GGML_CUDA_FA_ALL_QUANTS@) +set(GGML_CUDA_GRAPHS @GGML_CUDA_GRAPHS@) + +set(GGML_HIP_UMA @GGML_HIP_UMA@) + set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@) -set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@) -set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@) -set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@) -set(GGML_OPENMP @GGML_OPENMP@) +set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@) +set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@) +set(GGML_VULKAN_SHADER_DEBUG_INFO @GGML_VULKAN_SHADER_DEBUG_INFO@) +set(GGML_VULKAN_PERF @GGML_VULKAN_PERF@) +set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@) +set(GGML_VULKAN_RUN_TESTS @GGML_VULKAN_RUN_TESTS@) + +set(GGML_METAL_USE_BF16 @GGML_METAL_USE_BF16@) +set(GGML_METAL_NDEBUG @GGML_METAL_NDEBUG@) +set(GGML_METAL_SHADER_DEBUG @GGML_METAL_SHADER_DEBUG@) +set(GGML_METAL_EMBED_LIBRARY @GGML_METAL_EMBED_LIBRARY@) +set(GGML_METAL_MACOSX_VERSION_MIN @GGML_METAL_MACOSX_VERSION_MIN@) +set(GGML_METAL_STD @GGML_METAL_STD@) + +set(GGML_SYCL_F16 @GGML_SYCL_F16@) +set(GGML_SYCL_TARGET @GGML_SYCL_TARGET@) +set(GGML_SYCL_DEVICE_ARCH @GGML_SYCL_DEVICE_ARCH@) + @PACKAGE_INIT@ @@ -20,6 +68,7 @@ find_package(Threads REQUIRED) set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@") set(_llama_link_deps "") +set(_llama_link_opts "") foreach(_ggml_lib ggml ggml-base) string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY") find_library(${_ggml_lib_var} ${_ggml_lib} @@ -49,41 +98,63 @@ foreach(backend amx blas cann cpu cuda hip kompute metal musa rpc sycl vulkan) endif() endforeach() -if (APPLE AND GGML_ACCELERATE) - find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) -endif() +if (NOT LLAMA_SHARED_LIB) + if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) + list(APPEND _llama_link_deps ${ACCELERATE_FRAMEWORK}) + endif() -if (GGML_BLAS) - find_package(BLAS REQUIRED) -endif() + if (GGML_OPENMP) + find_package(OpenMP REQUIRED) + list(APPEND _llama_link_deps OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + endif() -if (GGML_CUDA) - find_package(CUDAToolkit REQUIRED) -endif() + if (GGML_CPU_HBM) + find_library(memkind memkind REQUIRED) + list(APPEND _llama_link_deps memkind) + endif() -if (GGML_METAL) - find_library(FOUNDATION_LIBRARY Foundation REQUIRED) - find_library(METAL_FRAMEWORK Metal REQUIRED) - find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) -endif() + if (GGML_BLAS) + find_package(BLAS REQUIRED) + list(APPEND _llama_link_deps ${BLAS_LIBRARIES}) + list(APPEND _llama_link_opts ${BLAS_LINKER_FLAGS}) + endif() -if (GGML_VULKAN) - find_package(Vulkan REQUIRED) -endif() + if (GGML_CUDA) + find_package(CUDAToolkit REQUIRED) + endif() -if (GGML_HIP) - find_package(hip REQUIRED) - find_package(hipblas REQUIRED) - find_package(rocblas REQUIRED) -endif() + if (GGML_METAL) + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + list(APPEND _llama_link_deps ${FOUNDATION_LIBRARY} + ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK}) + endif() -if (GGML_SYCL) - find_package(IntelSYCL REQUIRED) - find_package(MKL REQUIRED) -endif() + if (GGML_VULKAN) + find_package(Vulkan REQUIRED) + list(APPEND _llama_link_deps Vulkan::Vulkan) + endif() -if (GGML_OPENMP) - find_package(OpenMP REQUIRED) + if (GGML_HIP) + find_package(hip REQUIRED) + find_package(hipblas REQUIRED) + find_package(rocblas REQUIRED) + list(APPEND _llama_link_deps hip::host roc::rocblas roc::hipblas) + endif() + + if (GGML_SYCL) + find_package(DNNL) + if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") + list(APPEND _llama_link_deps DNNL::dnnl) + endif() + if (WIN32) + find_package(IntelSYCL REQUIRED) + find_package(MKL REQUIRED) + list(APPEND _llama_link_deps IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) + endif() + endif() endif() find_library(llama_LIBRARY llama @@ -97,6 +168,7 @@ set_target_properties(llama PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}" INTERFACE_LINK_LIBRARIES "${_llama_link_deps}" + INTERFACE_LINK_OPTIONS "${_llama_link_opts}" INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}" IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" IMPORTED_LOCATION "${llama_LIBRARY}" From 9abe9eeae98b11fa93b82632b264126a010225ff Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 20 Nov 2024 13:47:36 -0600 Subject: [PATCH 06/14] vulkan: predicate max operation in soft_max shaders/soft_max (#10437) Fixes #10434 --- ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp index f9727679e..6e20b6411 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp @@ -73,7 +73,9 @@ void soft_max(uint num_iters) { FLOAT_TYPE v = a * p.scale + slope * b; - max_val = max(max_val, v); + if (col < p.KX) { + max_val = max(max_val, v); + } if (idx < DATA_CACHE_SIZE) { data_cache[idx] = v; From 02e4eaf22f229a114054b053a9eff61483653670 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 20 Nov 2024 14:56:04 +0100 Subject: [PATCH 07/14] ggml-opt: fix data corruption (ggml/1022) --- ggml/src/ggml-backend.cpp | 2 + ggml/src/ggml-impl.h | 3 + ggml/src/ggml-opt.cpp | 147 +++++++++++++++++-------------------- ggml/src/ggml.c | 94 ++++++++++++++---------- tests/test-backend-ops.cpp | 1 - 5 files changed, 129 insertions(+), 118 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 9dcde8d11..3433d082e 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -252,6 +252,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten } void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(tensor); ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; if (size == 0) { @@ -266,6 +267,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz } void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(tensor); ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; if (size == 0) { diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 92a64fe5a..3965be787 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -295,6 +295,9 @@ struct ggml_cgraph { enum ggml_cgraph_eval_order order; }; +// returns a slice of cgraph with nodes [i0, i1) +// the slice does not have leafs or gradients +// if you need the gradients, get them from the original graph struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); // Memory allocation diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp index 040205a31..7c3e24103 100644 --- a/ggml/src/ggml-opt.cpp +++ b/ggml/src/ggml-opt.cpp @@ -14,51 +14,51 @@ #include struct ggml_opt_dataset { - struct ggml_context * ctx; - ggml_backend_buffer_t buf; - struct ggml_tensor * data; - struct ggml_tensor * labels; + struct ggml_context * ctx = nullptr; + ggml_backend_buffer_t buf = nullptr; + struct ggml_tensor * data = nullptr; + struct ggml_tensor * labels = nullptr; - int64_t ndata; - int64_t ndata_shard; - size_t nbs_data; - size_t nbs_labels; + int64_t ndata = -1; + int64_t ndata_shard = -1; + size_t nbs_data = -1; + size_t nbs_labels = -1; std::vector permutation; }; struct ggml_opt_context { - ggml_backend_sched_t backend_sched; - ggml_cgraph * allocated_graph; - ggml_cgraph * allocated_graph_copy; - struct ggml_context * ctx_static; - struct ggml_context * ctx_static_cpu; - struct ggml_context * ctx_compute; - struct ggml_context * ctx_copy; - ggml_backend_buffer_t buf_static; - ggml_backend_buffer_t buf_static_cpu; + ggml_backend_sched_t backend_sched = nullptr; + ggml_cgraph * allocated_graph = nullptr; + ggml_cgraph * allocated_graph_copy = nullptr; + struct ggml_context * ctx_static = nullptr; + struct ggml_context * ctx_static_cpu = nullptr; + struct ggml_context * ctx_compute = nullptr; + struct ggml_context * ctx_copy = nullptr; + ggml_backend_buffer_t buf_static = nullptr; + ggml_backend_buffer_t buf_static_cpu = nullptr; std::mt19937 rng; - struct ggml_tensor * inputs; - struct ggml_tensor * outputs; - struct ggml_tensor * labels; + struct ggml_tensor * inputs = nullptr; + struct ggml_tensor * outputs = nullptr; + struct ggml_tensor * labels = nullptr; - struct ggml_tensor * loss; - struct ggml_tensor * pred; - struct ggml_tensor * ncorrect; + struct ggml_tensor * loss = nullptr; + struct ggml_tensor * pred = nullptr; + struct ggml_tensor * ncorrect = nullptr; - struct ggml_cgraph * gf; - struct ggml_cgraph * gb_grad; - struct ggml_cgraph * gb_opt; + struct ggml_cgraph * gf = nullptr; + struct ggml_cgraph * gb_grad = nullptr; + struct ggml_cgraph * gb_opt = nullptr; - int64_t iter; - int32_t opt_period; - int32_t opt_i; - bool loss_per_datapoint; + int64_t iter = 1; + int32_t opt_period = 1; + int32_t opt_i = 0; + bool loss_per_datapoint = false; - ggml_opt_get_optimizer_params get_opt_pars; - void * get_opt_pars_ud; - struct ggml_tensor * adamw_params; + ggml_opt_get_optimizer_params get_opt_pars = nullptr; + void * get_opt_pars_ud = nullptr; + struct ggml_tensor * adamw_params = nullptr; }; struct ggml_opt_result { @@ -67,8 +67,8 @@ struct ggml_opt_result { std::vector pred; int64_t ncorrect = 0; - bool loss_per_datapoint = false; - int64_t opt_period = -1; + int64_t opt_period = -1; + bool loss_per_datapoint = false; }; // ====== Dataset ====== @@ -188,11 +188,11 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us } struct ggml_opt_params ggml_opt_default_params( - ggml_backend_sched_t backend_sched, - struct ggml_context * ctx_compute, - struct ggml_tensor * inputs, - struct ggml_tensor * outputs, - enum ggml_opt_loss_type loss_type) { + ggml_backend_sched_t backend_sched, + struct ggml_context * ctx_compute, + struct ggml_tensor * inputs, + struct ggml_tensor * outputs, + enum ggml_opt_loss_type loss_type) { return { /*backend_sched =*/ backend_sched, /*ctx_compute =*/ ctx_compute, @@ -237,25 +237,33 @@ static ggml_tensor * map_tensor(std::map & tensor_ return new_tensor; } -static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * graph) { +static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) { std::map tensor_map; - ggml_cgraph * new_graph = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); + ggml_cgraph * dst = ggml_new_graph_custom(ctx, src->size, /*grads =*/ true); - for (int i = 0; i < graph->n_leafs; i++) { - ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->leafs[i])); + for (int i = 0; i < src->n_leafs; i++) { + ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i])); } - for (int i = 0; i < graph->n_nodes; i++) { - ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->nodes[i])); + GGML_ASSERT(dst->n_leafs == src->n_leafs); + for (int i = 0; i < src->n_nodes; i++) { + ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i])); } - for (int i = 0; i < graph->n_nodes; ++i) { - const size_t igrad_src = ggml_hash_find(&graph->visited_hash_set, graph->nodes[i]); - const size_t igrad_dst = ggml_hash_find(&new_graph->visited_hash_set, new_graph->nodes[i]); - graph->grads[igrad_dst] = new_graph->grads[igrad_src]; - graph->grad_accs[igrad_dst] = new_graph->grad_accs[igrad_src]; + GGML_ASSERT(dst->n_nodes == src->n_nodes); + for (int i = 0; i < src->n_nodes; ++i) { + const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]); + const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]); + + GGML_ASSERT(igrad_src != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src)); + GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst)); + + dst->grads[igrad_dst] = src->grads[igrad_src]; + dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src]; } - return new_graph; + return dst; } static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) { @@ -284,18 +292,13 @@ static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { ggml_opt_context_t result = new struct ggml_opt_context; - result->backend_sched = params.backend_sched; - result->allocated_graph = nullptr; - result->allocated_graph_copy = nullptr; - result->ctx_compute = params.ctx_compute; - result->ctx_copy = nullptr; - result->inputs = params.inputs; - result->outputs = params.outputs; - result->iter = 1; - result->opt_period = params.opt_period; - result->opt_i = 0; - result->get_opt_pars = params.get_opt_pars; - result->get_opt_pars_ud = params.get_opt_pars_ud; + result->backend_sched = params.backend_sched; + result->ctx_compute = params.ctx_compute; + result->inputs = params.inputs; + result->outputs = params.outputs; + result->opt_period = params.opt_period; + result->get_opt_pars = params.get_opt_pars; + result->get_opt_pars_ud = params.get_opt_pars_ud; GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically"); GGML_ASSERT(result->opt_period >= 1); @@ -348,7 +351,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { switch (params.loss_type) { case GGML_OPT_LOSS_TYPE_MEAN: { - result->labels = nullptr; result->loss = ggml_sum(result->ctx_static, result->outputs); ggml_set_name(result->loss, "loss_sum"); const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs)); @@ -358,7 +360,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { break; } case GGML_OPT_LOSS_TYPE_SUM: { - result->labels = nullptr; result->loss = ggml_sum(result->ctx_static, result->outputs); ggml_set_name(result->loss, "loss_sum"); result->loss_per_datapoint = false; @@ -413,14 +414,7 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { } if (params.build_type == GGML_OPT_BUILD_TYPE_FORWARD) { - result->gb_grad = nullptr; - result->gb_opt = nullptr; - result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0)); - result->buf_static_cpu = nullptr; - - ggml_opt_alloc_graph(result, result->gf); - return result; } @@ -429,14 +423,8 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate); if (params.build_type == GGML_OPT_BUILD_TYPE_GRAD) { - result->gb_opt = nullptr; - result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0)); - result->buf_static_cpu = nullptr; - - ggml_opt_alloc_graph(result, result->gb_grad); ggml_graph_reset(result->gb_grad); - return result; } @@ -466,7 +454,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { result->buf_static_cpu = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, ggml_backend_cpu_buffer_type()); - ggml_opt_alloc_graph(result, result->gb_opt); ggml_graph_reset(result->gb_opt); return result; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index ee72a173e..719d75c70 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -5019,8 +5019,10 @@ static void ggml_hash_map_free(struct hash_map * map) { } // utility functions to change gradients -// if a is in acc_table, modify gradients in-place and mark result as gradient accumulator -// else if a is in zero_table, replace a +// isrc is the index of tensor in cgraph->visited_has_set.keys +// the corresponding gradient (accumulators) are also at position isrc +// if tensor has a gradient accumulator, modify that accumulator in-place +// else if there is no gradient for tensor, set the corresponding value // else, just add/subtract/etc. the gradients static void ggml_add_or_set( @@ -5028,11 +5030,14 @@ static void ggml_add_or_set( struct ggml_cgraph * cgraph, size_t isrc, struct ggml_tensor * tensor) { + struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc]; + GGML_ASSERT(src); if (cgraph->grads[isrc]) { - cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]); + cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, /*inplace =*/ cgraph->grad_accs[isrc]); } else { cgraph->grads[isrc] = tensor; } + ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name); ggml_build_forward_expand(cgraph, cgraph->grads[isrc]); } @@ -5040,18 +5045,20 @@ static void ggml_acc_or_set( struct ggml_context * ctx, struct ggml_cgraph * cgraph, size_t isrc, - struct ggml_tensor * src, struct ggml_tensor * tensor, const size_t nb1, const size_t nb2, const size_t nb3, const size_t offset) { + struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc]; + GGML_ASSERT(src); if (cgraph->grads[isrc]) { cgraph->grads[isrc] = ggml_acc_impl(ctx, cgraph->grads[isrc], tensor, nb1, nb2, nb3, offset, cgraph->grad_accs[isrc]); } else { struct ggml_tensor * a_zero = ggml_scale(ctx, src, 0.0f); // FIXME this is going to produce NaN if a contains inf/NaN cgraph->grads[isrc] = ggml_acc_impl(ctx, a_zero, tensor, nb1, nb2, nb3, offset, false); } + ggml_format_name(cgraph->grads[isrc], "grad for %s", cgraph->visited_hash_set.keys[isrc]->name); ggml_build_forward_expand(cgraph, cgraph->grads[isrc]); } @@ -5059,13 +5066,15 @@ static void ggml_add1_or_set( struct ggml_context * ctx, struct ggml_cgraph * cgraph, size_t isrc, - struct ggml_tensor * src, struct ggml_tensor * tensor) { + struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc]; + GGML_ASSERT(src); if (cgraph->grads[isrc]) { cgraph->grads[isrc] = ggml_add1_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]); } else { cgraph->grads[isrc] = ggml_repeat(ctx, tensor, src); } + ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name); ggml_build_forward_expand(cgraph, cgraph->grads[isrc]); } @@ -5074,11 +5083,14 @@ static void ggml_sub_or_set( struct ggml_cgraph * cgraph, size_t isrc, struct ggml_tensor * tensor) { + struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc]; + GGML_ASSERT(src); if (cgraph->grads[isrc]) { cgraph->grads[isrc] = ggml_sub_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]); } else { cgraph->grads[isrc] = ggml_neg(ctx, tensor); } + ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name); ggml_build_forward_expand(cgraph, cgraph->grads[isrc]); } @@ -5095,12 +5107,12 @@ static void ggml_compute_backward( struct ggml_tensor * src1 = tensor->src[1]; struct ggml_tensor * src2 = tensor->src[2]; struct ggml_hash_set * hash_set = &cgraph->visited_hash_set; - const size_t isrc0 = ggml_hash_find(hash_set, src0); - const size_t isrc1 = ggml_hash_find(hash_set, src1); - const size_t isrc2 = ggml_hash_find(hash_set, src2); - const bool src0_needs_grads = isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0]; - const bool src1_needs_grads = isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1]; - const bool src2_needs_grads = isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2]; + const size_t isrc0 = src0 ? ggml_hash_find(hash_set, src0) : (size_t) -1; + const size_t isrc1 = src1 ? ggml_hash_find(hash_set, src1) : (size_t) -1; + const size_t isrc2 = src2 ? ggml_hash_find(hash_set, src2) : (size_t) -1; + const bool src0_needs_grads = src0 && isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0]; + const bool src1_needs_grads = src1 && isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1]; + const bool src2_needs_grads = src2 && isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2]; switch (tensor->op) { case GGML_OP_DUP: { @@ -5200,7 +5212,7 @@ static void ggml_compute_backward( } break; case GGML_OP_SUM: { if (src0_needs_grads) { - ggml_add1_or_set(ctx, cgraph, isrc0, src0, grad); + ggml_add1_or_set(ctx, cgraph, isrc0, grad); } } break; case GGML_OP_SUM_ROWS: { @@ -5210,7 +5222,7 @@ static void ggml_compute_backward( } break; case GGML_OP_MEAN: { if (src0_needs_grads) { - ggml_add1_or_set(ctx, cgraph, isrc0, src0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false)); + ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false)); } } break; case GGML_OP_REPEAT: { @@ -5363,7 +5375,7 @@ static void ggml_compute_backward( nb3 = (nb3 / n0) * ng; } - ggml_acc_or_set(ctx, cgraph, isrc0, src0, grad, nb1, nb2, nb3, offset); + ggml_acc_or_set(ctx, cgraph, isrc0, grad, nb1, nb2, nb3, offset); } } break; case GGML_OP_PERMUTE: { @@ -5597,10 +5609,9 @@ void ggml_build_backward_expand( const int n_nodes_f = cgraph->n_nodes; - const size_t hash_size = ggml_hash_size(2*cgraph->size); - memset(cgraph->grads, 0, hash_size*sizeof(struct ggml_tensor *)); - memset(cgraph->grad_accs, 0, hash_size*sizeof(struct ggml_tensor *)); - bool * grads_needed = calloc(hash_size, sizeof(bool)); + memset(cgraph->grads, 0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *)); + memset(cgraph->grad_accs, 0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *)); + bool * grads_needed = calloc(cgraph->visited_hash_set.size, sizeof(bool)); { bool any_params = false; @@ -5621,7 +5632,7 @@ void ggml_build_backward_expand( continue; } - bool node_needs_grad = node->flags & GGML_TENSOR_FLAG_PARAM; + bool node_needs_grad = (node->flags & GGML_TENSOR_FLAG_PARAM) || (node->flags & GGML_TENSOR_FLAG_LOSS); bool ignore_src[GGML_MAX_SRC] = {false}; switch (node->op) { // gradients in node->src[0] for one reason or another have no effect on output gradients @@ -5638,7 +5649,7 @@ void ggml_build_backward_expand( } break; // gradients in node->src[1] for one reason or another have no effect on output gradients - case GGML_OP_CPY: // gradients in CPY target are irrelevant + case GGML_OP_CPY: // gradients in CPY target are irrelevant case GGML_OP_GET_ROWS: // row indices not differentiable case GGML_OP_GET_ROWS_BACK: // same as for GET_ROWS case GGML_OP_ROPE: // positions not differentiable @@ -5665,9 +5676,12 @@ void ggml_build_backward_expand( node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE); const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node); + GGML_ASSERT(igrad != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, igrad)); if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) { - cgraph->grads[igrad] = ggml_dup_tensor(ctx_static, node); - cgraph->grad_accs[igrad] = cgraph->grads[igrad]; + cgraph->grad_accs[igrad] = ggml_dup_tensor(ctx_static, node); + cgraph->grads[igrad] = cgraph->grad_accs[igrad]; + ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name); } grads_needed[igrad] = true; } @@ -5761,15 +5775,15 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) { struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) { struct ggml_cgraph cgraph = { - /*.size =*/ 0, - /*.n_nodes =*/ i1 - i0, - /*.n_leafs =*/ 0, - /*.nodes =*/ cgraph0->nodes + i0, - /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL, - /*.grad_accs =*/ cgraph0->grad_accs ? cgraph0->grad_accs + i0 : NULL, - /*.leafs =*/ NULL, - /*.hash_table =*/ { 0, NULL, NULL }, - /*.order =*/ cgraph0->order, + /*.size =*/ 0, + /*.n_nodes =*/ i1 - i0, + /*.n_leafs =*/ 0, + /*.nodes =*/ cgraph0->nodes + i0, + /*.grads =*/ NULL, // gradients would need visited_hash_set + /*.grad_accs =*/ NULL, + /*.leafs =*/ NULL, + /*.visited_hash_set =*/ { 0, NULL, NULL }, + /*.order =*/ cgraph0->order, }; return cgraph; @@ -5799,12 +5813,22 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) { } } + if (dst->grads) { + memset(dst->grads, 0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *)); + memset(dst->grad_accs, 0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *)); + } if (src->grads) { GGML_ASSERT(dst->grads != NULL); GGML_ASSERT(dst->grad_accs != NULL); for (int i = 0; i < src->n_nodes; ++i) { const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]); const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]); + + GGML_ASSERT(igrad_src != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src)); + GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst)); + dst->grads[igrad_dst] = src->grads[igrad_src]; dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src]; } @@ -5839,12 +5863,8 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { if (node->op == GGML_OP_OPT_STEP_ADAMW) { // clear momenta - if (node->src[2]->data) { - ggml_set_zero(node->src[2]); - } - if (node->src[3]->data) { - ggml_set_zero(node->src[3]); - } + ggml_set_zero(node->src[2]); + ggml_set_zero(node->src[3]); } // initial gradients of loss should be 1, 0 otherwise diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 01ac7166e..37342c156 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -819,7 +819,6 @@ struct test_case { } } - // TODO: refactor so that this check is only needed once for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { if (!ggml_backend_supports_op(backend, t)) { printf("not supported [%s] ", ggml_backend_name(backend)); From 59b917282236eadfb82bf1f46a31eb119941da08 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Nov 2024 13:25:08 +0100 Subject: [PATCH 08/14] ggml/sched : do not skip views in pre-assignments --- ggml/src/ggml-backend.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 3433d082e..45da0c27d 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -886,9 +886,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; int * node_backend_id = &tensor_backend_id(node); - if (ggml_is_view_op(node->op)) { - continue; - } // do not overwrite user assignments if (*node_backend_id == -1) { *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); From 87a533be57e602f8ca469d14ad15ee851265b655 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 Nov 2024 09:22:11 +0200 Subject: [PATCH 09/14] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index e9bd2dbb0..d101d2b57 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -2884dd72fea8922910fe53387c3d17ab928d3a8e +6fcbd60bc72ac3f7ad43f78c87e535f2e6206f58 From 1bb30bf28cb5a7adf111bc41c935bdaf128397e7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 Nov 2024 10:22:47 +0200 Subject: [PATCH 10/14] llama : handle KV shift for recurrent models (#10402) ggml-ci --- src/llama.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index c51b36e66..001711037 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -18211,13 +18211,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { static void llama_kv_cache_update_internal(struct llama_context & lctx) { bool need_reserve = false; - // apply K-shift if needed - if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { + if (lctx.kv_self.has_shift) { if (!llama_kv_cache_can_shift(&lctx)) { - GGML_ABORT("Deepseek2 does not support K-shift"); + GGML_ABORT("The current context does not support K-shift"); } - { + // apply K-shift if needed + if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { ggml_backend_sched_reset(lctx.sched.get()); ggml_cgraph * gf = llama_build_graph_k_shift(lctx); @@ -20463,7 +20463,7 @@ void llama_kv_cache_update(struct llama_context * ctx) { } bool llama_kv_cache_can_shift(struct llama_context * ctx) { - return ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA + return !ctx->kv_self.recurrent && ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA } // deprecated From a5e47592b6171ae21f3eaa1aba6fb2b707875063 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Thu, 21 Nov 2024 18:18:50 +0100 Subject: [PATCH 11/14] cuda : optimize argmax (#10441) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * cuda : optimize argmax * remove unused parameter ggml-ci * fixup : use full warps ggml-ci * Apply suggestions from code review Co-authored-by: Johannes Gäßler * fix ub * ggml : check ne00 <= INT32_MAX in argmax and argsort --------- Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/argmax.cu | 108 ++++++++++++++++++--------------- ggml/src/ggml-cuda/common.cuh | 30 ++++----- ggml/src/ggml-cuda/quantize.cu | 8 +-- ggml/src/ggml.c | 2 + tests/test-backend-ops.cpp | 29 +++++++++ 5 files changed, 110 insertions(+), 67 deletions(-) diff --git a/ggml/src/ggml-cuda/argmax.cu b/ggml/src/ggml-cuda/argmax.cu index aab04eca7..5340eedc0 100644 --- a/ggml/src/ggml-cuda/argmax.cu +++ b/ggml/src/ggml-cuda/argmax.cu @@ -1,57 +1,69 @@ -#include "common.cuh" -#include "argmax.cuh" -#include "sum.cuh" - +#include #include -static __global__ void argmax_f32( - const float * x, int32_t * dst, const int64_t ncols, const int64_t nrows) { +#include "argmax.cuh" +#include "common.cuh" +#include "sum.cuh" - int argmax_thread = 0; - const int64_t row0 = (int64_t)blockIdx.x*WARP_SIZE; +static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __restrict__ dst, const int64_t ncols) { + const int64_t row = blockIdx.x; -#pragma unroll - for (int64_t row1 = 0; row1 < WARP_SIZE; ++row1) { - const int64_t row = row0 + row1; + float maxval = -FLT_MAX; + int argmax = -1; + const float * rowx = x + row * ncols; - if (row >= nrows) { - break; + for (int32_t col = threadIdx.x; col < ncols; col += blockDim.x) { + const float val = rowx[col]; + if (val > maxval) { + maxval = val; + argmax = col; } - - float maxval = -FLT_MAX; - int argmax = -1; - - for (int32_t col = threadIdx.x; col < ncols; col += WARP_SIZE) { - const float val = x[row*ncols + col]; - const int bigger = val > maxval; - const int not_bigger = bigger ^ 0x00000001; - - maxval = maxval*not_bigger + val*bigger; - argmax = argmax*not_bigger + col*bigger; - } - -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, mask, WARP_SIZE); - const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, mask, WARP_SIZE); - const int bigger = val > maxval; - const int not_bigger = bigger ^ 0x00000001; - - maxval = maxval*not_bigger + val*bigger; - argmax = argmax*not_bigger + col*bigger; - } - - const int store = row1 == threadIdx.x; - argmax_thread += store*argmax; } - const int row = row0 + threadIdx.x; - - if (row >= nrows) { - return; +#pragma unroll + for (int offset = 16; offset > 0; offset >>= 1) { + const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE); + const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE); + if (val > maxval) { + maxval = val; + argmax = col; + } } - dst[row] = argmax_thread; + const int n_warps = blockDim.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + const int warp_id = threadIdx.x / WARP_SIZE; + if (n_warps > 1) { + constexpr int max_warps = 1024 / WARP_SIZE; + __shared__ float shared_maxval[max_warps]; + __shared__ int shared_argmax[max_warps]; + if (lane_id == 0) { + shared_maxval[warp_id] = maxval; + shared_argmax[warp_id] = argmax; + } + + __syncthreads(); + + if (warp_id == 0) { + if (lane_id < n_warps) { + maxval = shared_maxval[lane_id]; + argmax = shared_argmax[lane_id]; + } +#pragma unroll + for (int offset = 16; offset > 0; offset >>= 1) { + const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE); + const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE); + if (val > maxval) { + maxval = val; + argmax = col; + } + } + } + } + + if (warp_id == 0 && lane_id == 0) { + dst[row] = argmax; + } } void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -70,10 +82,10 @@ void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { cudaStream_t stream = ctx.stream(); - const int64_t num_blocks = (nrows + WARP_SIZE - 1) / WARP_SIZE; - - const dim3 blocks_dim(WARP_SIZE, 1, 1); + const int64_t num_blocks = nrows; + const int64_t num_threads = std::min(1024, (ne00 + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE); + const dim3 blocks_dim(num_threads, 1, 1); const dim3 blocks_num(num_blocks, 1, 1); - argmax_f32<<>>(src0_d, dst_d, ne00, nrows); + argmax_f32<<>>(src0_d, dst_d, ne00); } diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index e146c691c..b0dd16066 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -180,8 +180,8 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) { return __reduce_add_sync(0xffffffff, x); #else #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x += __shfl_xor_sync(0xffffffff, x, mask, 32); + for (int offset = 16; offset > 0; offset >>= 1) { + x += __shfl_xor_sync(0xffffffff, x, offset, 32); } return x; #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE @@ -189,17 +189,17 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) { static __device__ __forceinline__ float warp_reduce_sum(float x) { #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x += __shfl_xor_sync(0xffffffff, x, mask, 32); + for (int offset = 16; offset > 0; offset >>= 1) { + x += __shfl_xor_sync(0xffffffff, x, offset, 32); } return x; } static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32); - a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32); + for (int offset = 16; offset > 0; offset >>= 1) { + a.x += __shfl_xor_sync(0xffffffff, a.x, offset, 32); + a.y += __shfl_xor_sync(0xffffffff, a.y, offset, 32); } return a; } @@ -209,16 +209,16 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32); + for (int offset = 16; offset > 0; offset >>= 1) { + const half2 a_other = __shfl_xor_sync(0xffffffff, a, offset, 32); reinterpret_cast(a.x) += __low2half(a_other); reinterpret_cast(a.y) += __high2half(a_other); } return a; #else #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32)); + for (int offset = 16; offset > 0; offset >>= 1) { + a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, 32)); } return a; #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) @@ -231,8 +231,8 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { static __device__ __forceinline__ float warp_reduce_max(float x) { #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); + for (int offset = 16; offset > 0; offset >>= 1) { + x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, 32)); } return x; } @@ -275,8 +275,8 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); + for (int offset = 16; offset > 0; offset >>= 1) { + x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32)); } return x; #else diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index 45408ce86..1702e4ce2 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -69,8 +69,8 @@ static __global__ void quantize_mmq_q8_1( // Exchange max. abs. value between vals_per_scale/4 threads. #pragma unroll - for (int mask = vals_per_scale/8; mask > 0; mask >>= 1) { - amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, WARP_SIZE)); + for (int offset = vals_per_scale/8; offset > 0; offset >>= 1) { + amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, offset, WARP_SIZE)); } float sum; @@ -79,8 +79,8 @@ static __global__ void quantize_mmq_q8_1( // Exchange calculate sum across vals_per_sum/4 threads. #pragma unroll - for (int mask = vals_per_sum/8; mask > 0; mask >>= 1) { - sum += __shfl_xor_sync(0xFFFFFFFF, sum, mask, WARP_SIZE); + for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) { + sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE); } } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 719d75c70..78e7874de 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2255,6 +2255,7 @@ struct ggml_tensor * ggml_argmax( struct ggml_context * ctx, struct ggml_tensor * a) { GGML_ASSERT(ggml_is_matrix(a)); + GGML_ASSERT(a->ne[0] <= INT32_MAX); struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]); @@ -4138,6 +4139,7 @@ struct ggml_tensor * ggml_argsort( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_sort_order order) { + GGML_ASSERT(a->ne[0] <= INT32_MAX); struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne); ggml_set_op_params_i32(result, 0, (int32_t) order); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 37342c156..b2b570524 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1154,6 +1154,26 @@ struct test_argmax : public test_case { return out; } + void initialize_tensors(ggml_context * ctx) override { + std::random_device rd; + std::default_random_engine rng(rd()); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_F32) { + // initialize with unique values to avoid ties + for (int64_t r = 0; r < ggml_nrows(t); r++) { + std::vector data(t->ne[0]); + for (int i = 0; i < t->ne[0]; i++) { + data[i] = i; + } + std::shuffle(data.begin(), data.end(), rng); + ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float)); + } + } else { + init_tensor_uniform(t); + } + } + } + double max_nmse_err() override { return 0.0; } @@ -3440,6 +3460,11 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1)); test_cases.emplace_back(new test_argmax()); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 1, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {100, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {2000, 10, 1, 1})); + test_cases.emplace_back(new test_count_equal()); for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1 @@ -3830,6 +3855,10 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {64, 64, 20, 1}, false, 1.0f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 64, 20, 1}, false, 1.0f, 0.0f)); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32000, 512, 1, 1})); + for (int bs : {1, 512}) { for (ggml_type type_a : all_types) { for (ggml_type type_b : {GGML_TYPE_F32}) { From c18610b4ee29ca056bb4f2d375a4ad1b16f44ef7 Mon Sep 17 00:00:00 2001 From: leo-pony Date: Fri, 22 Nov 2024 14:07:20 +0800 Subject: [PATCH 12/14] CANN: Support Ascend310P to accelerate F32 and F16 Model (#10216) * CANN Support Ascend310P to accelerate F32 and F16 Model * Add compile option soc type macro ASCEND_310P to ggml-cann lib * Remove unused code * Remove the ascend soc_type hard code compile option in CMakelist.txt --- ggml/src/ggml-cann/CMakeLists.txt | 29 ++++++++++++++++ ggml/src/ggml-cann/aclnn_ops.cpp | 18 ++++++++++ ggml/src/ggml-cann/kernels/CMakeLists.txt | 7 ++-- ggml/src/ggml-cann/kernels/dup.cpp | 32 +++++++++++++----- ggml/src/ggml-cann/kernels/get_row_f16.cpp | 37 +++++++++++++-------- ggml/src/ggml-cann/kernels/get_row_f32.cpp | 36 ++++++++++++-------- ggml/src/ggml-cann/kernels/get_row_q4_0.cpp | 5 ++- 7 files changed, 123 insertions(+), 41 deletions(-) diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt index c8e15c6d4..756200b89 100644 --- a/ggml/src/ggml-cann/CMakeLists.txt +++ b/ggml/src/ggml-cann/CMakeLists.txt @@ -3,6 +3,33 @@ if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOM message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}") endif() +# Auto-detech Soc type and Soc version, if detect failed, will abort build +set(SOC_VERSION "") +function(detect_ascend_soc_type SOC_VERSION) + execute_process( + COMMAND bash -c "npu-smi info|awk -F' ' 'NF > 0 && NR==7 {print $3}'" + OUTPUT_VARIABLE npu_info + RESULT_VARIABLE npu_result + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if("${npu_info}" STREQUAL "" OR ${npu_result}) + message(FATAL_ERROR "Auto-detech ascend soc type failed, please specify manually or check ascend device working normally.") + endif() + set(${SOC_VERSION} "Ascend${npu_info}" PARENT_SCOPE) +endfunction() + +if(NOT SOC_TYPE) + detect_ascend_soc_type(SOC_VERSION) + set(SOC_TYPE "${SOC_VERSION}") + message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}") +else() + string(TOLOWER ${SOC_TYPE} SOC_VERSION) +endif() + +# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P. +string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}") +set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}") + if (CANN_INSTALL_DIR) # Only Support Linux. if (NOT UNIX) @@ -39,6 +66,8 @@ if (CANN_INSTALL_DIR) target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS}) target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64) + target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") + message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}") message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}") else() diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index a4ec8418e..1f4ee986c 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2312,6 +2312,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { switch (src0->type) { case GGML_TYPE_F32: + { +#ifdef ASCEND_310P + // Special operation for get_row_f32 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes + if ((src0->ne[0] % 8) != 0) { + size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); + ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); + } +#endif aclrtlaunch_ascendc_get_row_f32( 24, ctx.stream(), src0->data, src1->data, dst->data, ((ggml_tensor*)src0->extra)->ne, @@ -2320,7 +2328,16 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, ((ggml_tensor*)dst->extra)->nb); break; + } case GGML_TYPE_F16: + { +#ifdef ASCEND_310P + // Special operation for get_row_f16 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes + if ((src0->ne[0] % 16) != 0) { + size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16 + ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); + } +#endif aclrtlaunch_ascendc_get_row_f16( 24, ctx.stream(), src0->data, src1->data, dst->data, ((ggml_tensor*)src0->extra)->ne, @@ -2329,6 +2346,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, ((ggml_tensor*)dst->extra)->nb); break; + } case GGML_TYPE_Q4_0: aclrtlaunch_ascendc_get_row_q4_0( 24, ctx.stream(), src0->data, src1->data, dst->data, diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt index 5b4fef91b..6a4e17cce 100644 --- a/ggml/src/ggml-cann/kernels/CMakeLists.txt +++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt @@ -1,7 +1,3 @@ -if (NOT SOC_TYPE) - set (SOC_TYPE "Ascend910B3") -endif() - file(GLOB SRC_FILES get_row_f32.cpp get_row_f16.cpp @@ -13,7 +9,6 @@ file(GLOB SRC_FILES dup.cpp ) -string(TOLOWER ${SOC_TYPE} SOC_VERSION) set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR}) set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim") @@ -30,4 +25,6 @@ ascendc_library(ascendc_kernels STATIC ${SRC_FILES} ) +message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.") +ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) diff --git a/ggml/src/ggml-cann/kernels/dup.cpp b/ggml/src/ggml-cann/kernels/dup.cpp index e2c651152..99f03e058 100644 --- a/ggml/src/ggml-cann/kernels/dup.cpp +++ b/ggml/src/ggml-cann/kernels/dup.cpp @@ -5,6 +5,7 @@ using namespace AscendC; #define BUFFER_NUM 2 +const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supportted by dup kernel is 65535template template class DupByRows { @@ -19,6 +20,7 @@ class DupByRows { // Input has four dims. int64_t op_block_num = GetBlockNum(); int64_t op_block_idx = GetBlockIdx(); + assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM); // param num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3]; @@ -51,24 +53,36 @@ class DupByRows { __aicore__ inline void copy_in() { LocalTensor src_local = src_queue.AllocTensor(); - - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = num_elem * sizeof(SRC_T); - DataCopyPadExtParams padParams; - DataCopyPad(src_local, src_gm, dataCopyParams, padParams); - + const size_t elem_per_block = 32 / sizeof(SRC_T); + size_t tail = num_elem % elem_per_block; + size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem; + DataCopy(src_local, src_gm, cpy_elements_len); src_queue.EnQue(src_local); } __aicore__ inline void copy_out() { LocalTensor dst_local = dst_queue.DeQue(); - +#ifdef ASCEND_310P + const size_t elem_per_block = 32 / sizeof(DST_T); + size_t tail = num_elem % elem_per_block; + size_t len = num_elem & ~(elem_per_block - 1); + if (len > 0) { + DataCopy(dst_gm, dst_local, len); + } + if(tail != 0) { + for (size_t i = tail; i < elem_per_block; i++) { + dst_local[len + i].SetValue(0, 0); + } + SetAtomicAdd(); + DataCopy(dst_gm[len], dst_local[len], elem_per_block); + SetAtomicNone(); + } +#else DataCopyExtParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = num_elem * sizeof(DST_T); DataCopyPad(dst_gm, dst_local, dataCopyParams); - +#endif dst_queue.FreeTensor(dst_local); } diff --git a/ggml/src/ggml-cann/kernels/get_row_f16.cpp b/ggml/src/ggml-cann/kernels/get_row_f16.cpp index c704b5b2e..416b45104 100644 --- a/ggml/src/ggml-cann/kernels/get_row_f16.cpp +++ b/ggml/src/ggml-cann/kernels/get_row_f16.cpp @@ -14,7 +14,7 @@ class GET_ROW_F16 { int64_t *output_ne_ub, size_t *output_nb_ub) { // TODO, use template for F16/f32 int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); + op_block_idx = GetBlockIdx(); for (int i = 0; i < 4; i++) { input_ne[i] = input_ne_ub[i]; @@ -59,32 +59,42 @@ class GET_ROW_F16 { } __aicore__ inline void copy_in(uint32_t offset, size_t len) { + size_t origin_len = len; LocalTensor input_local = input_queue.AllocTensor(); - size_t tail = len % 32; - len = len & ~31; - DataCopy(input_local, input_gm[offset], len); + const size_t elem_per_block = 32 / sizeof(half); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); if(tail != 0) { - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = tail * sizeof(half); - DataCopyPadExtParams padParams; - DataCopyPad(input_local[len], input_gm[offset + len], - dataCopyParams, padParams); + len += elem_per_block; } + DataCopy(input_local, input_gm[offset], len); input_queue.EnQue(input_local); } __aicore__ inline void copy_out(uint32_t offset, size_t len) { LocalTensor output_local = output_queue.DeQue(); - size_t tail = len % 32; - len = len & ~31; - DataCopy(output_gm[offset], output_local, len); + const size_t elem_per_block = 32 / sizeof(float); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); + if (len > 0) { + DataCopy(output_gm[offset], output_local, len); + } + if(tail != 0) { +#ifdef ASCEND_310P + for (size_t i = tail; i < elem_per_block; i++) { + output_local[len + i].SetValue(0, 0); + } + SetAtomicAdd(); + DataCopy(output_gm[offset + len], output_local[len], elem_per_block); + SetAtomicNone(); +#else DataCopyExtParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = tail * sizeof(float); DataCopyPad(output_gm[offset + len], output_local[len], dataCopyParams); +#endif } output_queue.FreeTensor(output_local); } @@ -150,6 +160,7 @@ class GET_ROW_F16 { GlobalTensor output_gm; TQue input_queue; TQue output_queue; + int64_t op_block_idx; }; template diff --git a/ggml/src/ggml-cann/kernels/get_row_f32.cpp b/ggml/src/ggml-cann/kernels/get_row_f32.cpp index 9db080af3..02116905b 100644 --- a/ggml/src/ggml-cann/kernels/get_row_f32.cpp +++ b/ggml/src/ggml-cann/kernels/get_row_f32.cpp @@ -13,7 +13,7 @@ class GET_ROW_F32 { int64_t *indices_ne_ub, size_t *indices_nb_ub, int64_t *output_ne_ub, size_t *output_nb_ub) { int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); + op_block_idx = GetBlockIdx(); for (int i = 0; i < 4; i++) { input_ne[i] = input_ne_ub[i]; @@ -55,31 +55,40 @@ class GET_ROW_F32 { __aicore__ inline void copy_in(uint32_t offset, size_t len) { LocalTensor input_local = input_queue.AllocTensor(); - size_t tail = len % 32; - len = len & ~31; - DataCopy(input_local, input_gm[offset], len); + const size_t elem_per_block = 32 / sizeof(float); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); if(tail != 0) { - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = tail * sizeof(float); - DataCopyPadExtParams padParams; - DataCopyPad(input_local[len], input_gm[offset + len], - dataCopyParams, padParams); + len += elem_per_block; } + DataCopy(input_local, input_gm[offset], len); input_queue.EnQue(input_local); } __aicore__ inline void copy_out(uint32_t offset, size_t len) { LocalTensor output_local = output_queue.DeQue(); - size_t tail = len % 32; - len = len & ~31; - DataCopy(output_gm[offset], output_local, len); + const size_t elem_per_block = 32 / sizeof(float); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); + if (len > 0) { + DataCopy(output_gm[offset], output_local, len); + } + if(tail != 0) { +#ifdef ASCEND_310P + for (size_t i = tail; i < elem_per_block; i++) { + output_local[len + i].SetValue(0, 0); + } + SetAtomicAdd(); + DataCopy(output_gm[offset + len], output_local[len], elem_per_block); + SetAtomicNone(); +#else DataCopyExtParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = tail * sizeof(float); DataCopyPad(output_gm[offset + len], output_local[len], dataCopyParams); +#endif } output_queue.FreeTensor(output_local); } @@ -144,6 +153,7 @@ class GET_ROW_F32 { GlobalTensor output_gm; TQue input_queue; TQue output_queue; + int64_t op_block_idx; }; template diff --git a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp index a80bfeec2..377211096 100644 --- a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +++ b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp @@ -110,9 +110,12 @@ class GET_ROW_Q4_0 { LocalTensor output_local = output_queue.AllocTensor(); // TODO: cast more data to speed up. +#ifdef ASCEND_310P + // TODO: 310P support quantification +#else Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0); Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0); - +#endif // Only mul need compile by group. half scale = scale_gm.GetValue(scale_offset); From 599b3e0cd40432cd1975a8906f3db70bbe53b627 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 22 Nov 2024 08:32:40 +0100 Subject: [PATCH 13/14] GitHub: ask for more info in issue templates (#10426) * GitHub: ask for more info in issues [no ci] * refactor issue templates to be component-specific * more understandable issue description * add dropdown for llama.cpp module --- .github/ISSUE_TEMPLATE/01-bug-low.yml | 50 ---------- .../ISSUE_TEMPLATE/010-bug-compilation.yml | 73 ++++++++++++++ .github/ISSUE_TEMPLATE/011-bug-results.yml | 98 +++++++++++++++++++ .github/ISSUE_TEMPLATE/019-bug-misc.yml | 78 +++++++++++++++ .github/ISSUE_TEMPLATE/02-bug-medium.yml | 50 ---------- ...05-enhancement.yml => 020-enhancement.yml} | 2 +- .github/ISSUE_TEMPLATE/03-bug-high.yml | 50 ---------- .../{06-research.yml => 030-research.yml} | 2 +- .github/ISSUE_TEMPLATE/04-bug-critical.yml | 50 ---------- .../{07-refactor.yml => 040-refactor.yml} | 2 +- 10 files changed, 252 insertions(+), 203 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/01-bug-low.yml create mode 100644 .github/ISSUE_TEMPLATE/010-bug-compilation.yml create mode 100644 .github/ISSUE_TEMPLATE/011-bug-results.yml create mode 100644 .github/ISSUE_TEMPLATE/019-bug-misc.yml delete mode 100644 .github/ISSUE_TEMPLATE/02-bug-medium.yml rename .github/ISSUE_TEMPLATE/{05-enhancement.yml => 020-enhancement.yml} (97%) delete mode 100644 .github/ISSUE_TEMPLATE/03-bug-high.yml rename .github/ISSUE_TEMPLATE/{06-research.yml => 030-research.yml} (97%) delete mode 100644 .github/ISSUE_TEMPLATE/04-bug-critical.yml rename .github/ISSUE_TEMPLATE/{07-refactor.yml => 040-refactor.yml} (95%) diff --git a/.github/ISSUE_TEMPLATE/01-bug-low.yml b/.github/ISSUE_TEMPLATE/01-bug-low.yml deleted file mode 100644 index 54785854f..000000000 --- a/.github/ISSUE_TEMPLATE/01-bug-low.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: Low Severity Bugs -description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches) -title: "Bug: " -labels: ["bug-unconfirmed", "low severity"] -body: - - type: markdown - attributes: - value: | - Thanks for taking the time to fill out this bug report! - Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. - If possible, please provide a minimal code example that reproduces the bug. - - type: textarea - id: what-happened - attributes: - label: What happened? - description: Also tell us, what did you expect to happen? - placeholder: Tell us what you see! - validations: - required: true - - type: textarea - id: version - attributes: - label: Name and Version - description: Which executable and which version of our software are you running? (use `--version` to get a version string) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: What operating system are you seeing the problem on? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. - render: shell diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml new file mode 100644 index 000000000..550ee1b49 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml @@ -0,0 +1,73 @@ +name: Bug (compilation) +description: Something goes wrong when trying to compile llama.cpp. +title: "Compile bug: " +labels: ["bug-unconfirmed", "compilation"] +body: + - type: markdown + attributes: + value: > + Thanks for taking the time to fill out this bug report! + This issue template is intended for bug reports where the compilation of llama.cpp fails. + Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`. + If the compilation succeeds with ccache disabled you should be able to permanently fix the issue + by clearing `~/.cache/ccache` (on Linux). + - type: textarea + id: commit + attributes: + label: Git commit + description: Which commit are you trying to compile? + placeholder: | + $git rev-parse HEAD + 84a07a17b1b08cf2b9747c633a2372782848a27f + validations: + required: true + - type: dropdown + id: operating-system + attributes: + label: Which operating systems do you know to be affected? + multiple: true + options: + - Linux + - Mac + - Windows + - BSD + - Other? (Please let us know in description) + validations: + required: true + - type: dropdown + id: backends + attributes: + label: GGML backends + description: Which GGML backends do you know to be affected? + options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] + multiple: true + - type: textarea + id: steps_to_reproduce + attributes: + label: Steps to Reproduce + description: > + Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it. + If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us. + placeholder: > + Here are the exact commands that I used: ... + validations: + required: true + - type: textarea + id: first_bad_commit + attributes: + label: First Bad Commit + description: > + If the bug was not present on an earlier version: when did it start appearing? + If possible, please do a git bisect and identify the exact commit that introduced the bug. + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant log output + description: > + Please copy and paste any relevant log output, including the command that you entered and any generated text. + This will be automatically formatted into code, so no need for backticks. + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml new file mode 100644 index 000000000..1adb162b7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml @@ -0,0 +1,98 @@ +name: Bug (model use) +description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module). +title: "Eval bug: " +labels: ["bug-unconfirmed", "model evaluation"] +body: + - type: markdown + attributes: + value: > + Thanks for taking the time to fill out this bug report! + This issue template is intended for bug reports where the model evaluation results + (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation. + If you encountered the issue while using an external UI (e.g. ollama), + please reproduce your issue using one of the examples/binaries in this repository. + The `llama-cli` binary can be used for simple and reproducible model inference. + - type: textarea + id: version + attributes: + label: Name and Version + description: Which version of our software are you running? (use `--version` to get a version string) + placeholder: | + $./llama-cli --version + version: 2999 (42b4109e) + built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu + validations: + required: true + - type: dropdown + id: operating-system + attributes: + label: Which operating systems do you know to be affected? + multiple: true + options: + - Linux + - Mac + - Windows + - BSD + - Other? (Please let us know in description) + validations: + required: true + - type: dropdown + id: backends + attributes: + label: GGML backends + description: Which GGML backends do you know to be affected? + options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] + multiple: true + - type: textarea + id: hardware + attributes: + label: Hardware + description: Which CPUs/GPUs are you using? + placeholder: > + e.g. Ryzen 5950X + 2x RTX 4090 + validations: + required: true + - type: textarea + id: model + attributes: + label: Model + description: > + Which model at which quantization were you using when encountering the bug? + If you downloaded a GGUF file off of Huggingface, please provide a link. + placeholder: > + e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M + validations: + required: false + - type: textarea + id: steps_to_reproduce + attributes: + label: Steps to Reproduce + description: > + Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it. + If you can narrow down the bug to specific hardware, compile flags, or command line arguments, + that information would be very much appreciated by us. + placeholder: > + e.g. when I run llama-cli with -ngl 99 I get garbled outputs. + When I use -ngl 0 it works correctly. + Here are the exact commands that I used: ... + validations: + required: true + - type: textarea + id: first_bad_commit + attributes: + label: First Bad Commit + description: > + If the bug was not present on an earlier version: when did it start appearing? + If possible, please do a git bisect and identify the exact commit that introduced the bug. + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant log output + description: > + Please copy and paste any relevant log output, including the command that you entered and any generated text. + This will be automatically formatted into code, so no need for backticks. + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml new file mode 100644 index 000000000..124cdee91 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml @@ -0,0 +1,78 @@ +name: Bug (misc.) +description: Something is not working the way it should (and it's not covered by any of the above cases). +title: "Misc. bug: " +labels: ["bug-unconfirmed"] +body: + - type: markdown + attributes: + value: > + Thanks for taking the time to fill out this bug report! + This issue template is intended for miscellaneous bugs that don't fit into any other category. + If you encountered the issue while using an external UI (e.g. ollama), + please reproduce your issue using one of the examples/binaries in this repository. + - type: textarea + id: version + attributes: + label: Name and Version + description: Which version of our software are you running? (use `--version` to get a version string) + placeholder: | + $./llama-cli --version + version: 2999 (42b4109e) + built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu + validations: + required: true + - type: dropdown + id: operating-system + attributes: + label: Which operating systems do you know to be affected? + multiple: true + options: + - Linux + - Mac + - Windows + - BSD + - Other? (Please let us know in description) + validations: + required: true + - type: dropdown + id: module + attributes: + label: Which llama.cpp modules do you know to be affected? + multiple: true + options: + - libllama (core library) + - llama-cli + - llama-server + - llama-bench + - llama-quantize + - Python/Bash scripts + - Other (Please specify in the next section) + validations: + required: true + - type: textarea + id: steps_to_reproduce + attributes: + label: Steps to Reproduce + description: > + Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it. + validations: + required: true + - type: textarea + id: first_bad_commit + attributes: + label: First Bad Commit + description: > + If the bug was not present on an earlier version: when did it start appearing? + If possible, please do a git bisect and identify the exact commit that introduced the bug. + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant log output + description: > + Please copy and paste any relevant log output, including the command that you entered and any generated text. + This will be automatically formatted into code, so no need for backticks. + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/02-bug-medium.yml b/.github/ISSUE_TEMPLATE/02-bug-medium.yml deleted file mode 100644 index a6285c6f0..000000000 --- a/.github/ISSUE_TEMPLATE/02-bug-medium.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: Medium Severity Bug -description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable) -title: "Bug: " -labels: ["bug-unconfirmed", "medium severity"] -body: - - type: markdown - attributes: - value: | - Thanks for taking the time to fill out this bug report! - Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. - If possible, please provide a minimal code example that reproduces the bug. - - type: textarea - id: what-happened - attributes: - label: What happened? - description: Also tell us, what did you expect to happen? - placeholder: Tell us what you see! - validations: - required: true - - type: textarea - id: version - attributes: - label: Name and Version - description: Which executable and which version of our software are you running? (use `--version` to get a version string) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: What operating system are you seeing the problem on? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. - render: shell diff --git a/.github/ISSUE_TEMPLATE/05-enhancement.yml b/.github/ISSUE_TEMPLATE/020-enhancement.yml similarity index 97% rename from .github/ISSUE_TEMPLATE/05-enhancement.yml rename to .github/ISSUE_TEMPLATE/020-enhancement.yml index 58fca7318..02dd4f575 100644 --- a/.github/ISSUE_TEMPLATE/05-enhancement.yml +++ b/.github/ISSUE_TEMPLATE/020-enhancement.yml @@ -1,5 +1,5 @@ name: Enhancement -description: Used to request enhancements for llama.cpp +description: Used to request enhancements for llama.cpp. title: "Feature Request: " labels: ["enhancement"] body: diff --git a/.github/ISSUE_TEMPLATE/03-bug-high.yml b/.github/ISSUE_TEMPLATE/03-bug-high.yml deleted file mode 100644 index ff816b937..000000000 --- a/.github/ISSUE_TEMPLATE/03-bug-high.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: High Severity Bug -description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow) -title: "Bug: " -labels: ["bug-unconfirmed", "high severity"] -body: - - type: markdown - attributes: - value: | - Thanks for taking the time to fill out this bug report! - Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. - If possible, please provide a minimal code example that reproduces the bug. - - type: textarea - id: what-happened - attributes: - label: What happened? - description: Also tell us, what did you expect to happen? - placeholder: Tell us what you see! - validations: - required: true - - type: textarea - id: version - attributes: - label: Name and Version - description: Which executable and which version of our software are you running? (use `--version` to get a version string) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: What operating system are you seeing the problem on? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. - render: shell diff --git a/.github/ISSUE_TEMPLATE/06-research.yml b/.github/ISSUE_TEMPLATE/030-research.yml similarity index 97% rename from .github/ISSUE_TEMPLATE/06-research.yml rename to .github/ISSUE_TEMPLATE/030-research.yml index 3ae4e9f8c..18975dbbf 100644 --- a/.github/ISSUE_TEMPLATE/06-research.yml +++ b/.github/ISSUE_TEMPLATE/030-research.yml @@ -1,5 +1,5 @@ name: Research -description: Track new technical research area +description: Track new technical research area. title: "Research: " labels: ["research 🔬"] body: diff --git a/.github/ISSUE_TEMPLATE/04-bug-critical.yml b/.github/ISSUE_TEMPLATE/04-bug-critical.yml deleted file mode 100644 index 7af42a80b..000000000 --- a/.github/ISSUE_TEMPLATE/04-bug-critical.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: Critical Severity Bug -description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss) -title: "Bug: " -labels: ["bug-unconfirmed", "critical severity"] -body: - - type: markdown - attributes: - value: | - Thanks for taking the time to fill out this bug report! - Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. - If possible, please provide a minimal code example that reproduces the bug. - - type: textarea - id: what-happened - attributes: - label: What happened? - description: Also tell us, what did you expect to happen? - placeholder: Tell us what you see! - validations: - required: true - - type: textarea - id: version - attributes: - label: Name and Version - description: Which executable and which version of our software are you running? (use `--version` to get a version string) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: What operating system are you seeing the problem on? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. - render: shell diff --git a/.github/ISSUE_TEMPLATE/07-refactor.yml b/.github/ISSUE_TEMPLATE/040-refactor.yml similarity index 95% rename from .github/ISSUE_TEMPLATE/07-refactor.yml rename to .github/ISSUE_TEMPLATE/040-refactor.yml index 3a68d3d53..b6e6ab36d 100644 --- a/.github/ISSUE_TEMPLATE/07-refactor.yml +++ b/.github/ISSUE_TEMPLATE/040-refactor.yml @@ -1,5 +1,5 @@ name: Refactor (Maintainers) -description: Used to track refactoring opportunities +description: Used to track refactoring opportunities. title: "Refactor: " labels: ["refactor"] body: From 6dfcfef0787e9902df29f510b63621f60a09a50b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=95=AD=E6=BE=A7=E9=82=A6?= <45505768+shou692199@users.noreply.github.com> Date: Fri, 22 Nov 2024 17:44:08 +0800 Subject: [PATCH 14/14] ci: Update oneAPI runtime dll packaging (#10428) This is the minimum runtime dll dependencies for oneAPI 2025.0 --- .github/workflows/build.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6ef0770f3..572f91643 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -986,13 +986,14 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin