From fc5bb53b324f19a22abf20ebd0bd4882547d8c37 Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Wed, 28 Jun 2023 20:18:55 +0200
Subject: [PATCH] Code abstraction, FP16 implementation, fix kernel, add FP16 to FP32 kernel

---
 Makefile                   |   1 +
 ggml-vulkan.cpp            | 581 +++++++++++++++----------------
 vk_shaders/f16_to_f32.glsl |  21 ++
 vk_shaders/matmul_f16.glsl | 100 +++++++
 vk_shaders/matmul_f16.spv  | Bin 2144 -> 0 bytes
 5 files changed, 362 insertions(+), 341 deletions(-)
 create mode 100644 vk_shaders/f16_to_f32.glsl
 create mode 100644 vk_shaders/matmul_f16.glsl
 delete mode 100644 vk_shaders/matmul_f16.spv

diff --git a/Makefile b/Makefile
index d176453a2..539504be8 100644
--- a/Makefile
+++ b/Makefile
@@ -222,6 +222,7 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 	glslc -fshader-stage=compute --target-env=vulkan1.2 -O vk_shaders/matmul_f32.glsl -o vk_shaders/matmul_f32.spv
 	glslc -fshader-stage=compute --target-env=vulkan1.2 -O vk_shaders/matmul_f16.glsl -o vk_shaders/matmul_f16.spv
+	glslc -fshader-stage=compute --target-env=vulkan1.2 -O vk_shaders/f16_to_f32.glsl -o vk_shaders/f16_to_f32.spv
 endif
 
 ifneq ($(filter aarch64%,$(UNAME_M)),)
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 9a6cf68dc..3f5d4c85a 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -58,6 +58,9 @@ struct vk_pipeline {
     vk::DescriptorSetLayout dsl;
     vk::PipelineLayout layout;
     vk::Pipeline pipeline;
+    uint32_t push_constant_size;
+    uint32_t parameter_count;
+    std::array<uint32_t, 3> wg_denoms;
 };
 
 vk::Instance vk_instance;
@@ -68,6 +71,7 @@ vk::Device vk_device;
 vk::CommandPool vk_command_pool_compute, vk_command_pool_transfer;
 VmaAllocator vk_allocator;
 vk_pipeline vk_pipeline_matmul_f32, vk_pipeline_matmul_f16;
+vk_pipeline vk_pipeline_f16_to_f32;
 VmaAllocation vk_buffer_qa_alloc, vk_buffer_a_alloc, vk_buffer_b_alloc, vk_buffer_c_alloc;
 vk::Buffer vk_buffer_qa, vk_buffer_a, vk_buffer_b, vk_buffer_c;
@@ -75,9 +79,22 @@ bool vk_fp16_support = false;
 
 static std::vector<std::tuple<void*, size_t, vk_buffer>> vk_buf_list;
 
-static vk_pipeline ggml_vk_create_pipeline(const std::string& path, const std::string& entrypoint, const std::vector<vk::DescriptorSetLayoutBinding>& dsl_binding, const vk::PushConstantRange& pcr) {
+static vk::CommandBuffer ggml_vk_cmd_buffer_create(vk::CommandPool& pool) {
+    vk::CommandBufferAllocateInfo command_buffer_alloc_info(
+        pool,
+        vk::CommandBufferLevel::ePrimary,
+        1);
+    const std::vector<vk::CommandBuffer> cmd_buffers = vk_device.allocateCommandBuffers(command_buffer_alloc_info);
+    return cmd_buffers.front();
+}
+
+static vk_pipeline ggml_vk_create_pipeline(const std::string& path, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_count, std::array<uint32_t, 3> wg_denoms) {
     vk_pipeline pipeline;
 
+    pipeline.parameter_count = parameter_count;
+    pipeline.push_constant_size = push_constant_count * sizeof(int);
+    pipeline.wg_denoms = wg_denoms;
+
     std::vector<char> matmul_shader_contents;
     if (std::ifstream shader_file{ path, std::ios::binary | std::ios::ate }) {
         const size_t file_size = shader_file.tellg();
@@ -96,6 +113,17 @@ static vk_pipeline ggml_vk_create_pipeline(const std::string& path, const std::s
     );
     vk::ShaderModule shader_module = vk_device.createShaderModule(shader_module_create_info);
 
+    std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
+    for (uint32_t i = 0; i < parameter_count; i++) {
+        dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
+    }
+
+    vk::PushConstantRange pcr(
+        vk::ShaderStageFlagBits::eCompute,
+        0,
+        pipeline.push_constant_size
+    );
+
     vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
         vk::DescriptorSetLayoutCreateFlags(),
         dsl_binding);
@@ -119,6 +147,55 @@ static vk_pipeline ggml_vk_create_pipeline(const std::string& path, const std::s
     return pipeline;
 }
 
+static void ggml_vk_dispatch_pipeline(vk_pipeline& pipeline, std::vector<vk_buffer*> buffers, std::vector<int> push_constants, std::array<uint32_t, 3> elements, vk::Fence& fence) {
+    vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count);
+    vk::DescriptorPoolCreateInfo descriptor_pool_create_info(vk::DescriptorPoolCreateFlags(), 1, descriptor_pool_size);
+    vk::DescriptorPool descriptor_pool = vk_device.createDescriptorPool(descriptor_pool_create_info);
+
+    vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(descriptor_pool, 1, &pipeline.dsl);
+    const std::vector<vk::DescriptorSet> descriptor_sets = vk_device.allocateDescriptorSets(descriptor_set_alloc_info);
+    vk::DescriptorSet descriptor_set = descriptor_sets.front();
+
+    std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
+    std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
+    for (uint32_t i = 0; i < pipeline.parameter_count; i++) {
+        descriptor_buffer_infos.push_back({buffers[i]->buffer, 0, buffers[i]->size});
+    }
+    for (uint32_t i = 0; i < pipeline.parameter_count; i++) {
+        write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]});
+    }
+
+    vk_device.updateDescriptorSets(write_descriptor_sets, {});
+
+    vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_compute);
+
+    vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
+    cmd_buffer.begin(cmd_buffer_begin_info);
+    cmd_buffer.pushConstants(pipeline.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constants);
+    cmd_buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline.pipeline);
+    cmd_buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
+                                  pipeline.layout,
+                                  0,
+                                  { descriptor_set },
+                                  {});
+    cmd_buffer.dispatch(CEIL_DIV(elements[0], pipeline.wg_denoms[0]), CEIL_DIV(elements[1], pipeline.wg_denoms[1]), CEIL_DIV(elements[2], pipeline.wg_denoms[2]));
+    cmd_buffer.end();
+
+    vk::Queue queue = vk_device.getQueue(vk_compute_queue_family_index, 0);
+
+    vk::SubmitInfo submit_info(0,
+                               nullptr,
+                               nullptr,
+                               1,
+                               &cmd_buffer);
+
+    // Wait for transfers to finish
+    vk_device.getQueue(vk_transfer_queue_family_index, 0).waitIdle();
+
+    queue.submit({ submit_info }, fence);
+}
+
 void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k);
 void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k);
 
@@ -128,9 +205,7 @@ void ggml_vk_init(void) {
     vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
 
     const std::vector<const char*> layers = {
-#ifdef GGML_DEBUG
         "VK_LAYER_KHRONOS_validation",
-#endif
     };
 
     vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers.size(), layers.data());
     vk_instance = vk::createInstance(instance_create_info);
@@ -193,19 +268,10 @@ void ggml_vk_init(void) {
     vmaCreateAllocator(&allocator_info, &vk_allocator);
 
     // Shaders
-    std::vector<vk::DescriptorSetLayoutBinding> dsl_binding = {
-        {0, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute},
-        {1, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute},
-        {2, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute}
-    };
+    vk_pipeline_matmul_f32 = ggml_vk_create_pipeline("vk_shaders/matmul_f32.spv", "main", 3, 6, {128, 128, 1});
+    vk_pipeline_matmul_f16 = ggml_vk_create_pipeline("vk_shaders/matmul_f16.spv", "main", 3, 6, {128, 128, 1});
 
-    vk::PushConstantRange pcr(
-        vk::ShaderStageFlagBits::eCompute,
-        0,
-        6 * sizeof(int)
-    );
-    vk_pipeline_matmul_f32 = ggml_vk_create_pipeline("vk_shaders/matmul_f32.spv", "main", dsl_binding, pcr);
-    vk_pipeline_matmul_f16 = ggml_vk_create_pipeline("vk_shaders/matmul_f16.spv", "main", dsl_binding, pcr);
+    vk_pipeline_f16_to_f32 = ggml_vk_create_pipeline("vk_shaders/f16_to_f32.spv", "main", 2, 1, {32, 1, 1});
 
     // Command pools
     vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(), vk_compute_queue_family_index);
@@ -214,14 +280,35 @@ void ggml_vk_init(void) {
     vk::CommandPoolCreateInfo command_pool_create_info_transfer(vk::CommandPoolCreateFlags(), vk_transfer_queue_family_index);
     vk_command_pool_transfer = vk_device.createCommandPool(command_pool_create_info_transfer);
 
-    // for (size_t m = 1; m < 10; m++) {
-    //     for (size_t n = 1; n < 10; n++) {
-    //         for (size_t k = 1; k < 10; k++) {
-    //             ggml_vk_test_matmul_f32(m * 128, n * 128, k * 128);
-    //             ggml_vk_test_matmul_f16(m * 128, n * 128, k * 128);
-    //         }
-    //     }
-    // }
+#ifdef VK_CHK_KERNEL
+    for (size_t m = 1; m < 10; m++) {
+        for (size_t n = 1; n < 10; n++) {
+            for (size_t k = 1; k < 10; k++) {
+                ggml_vk_test_matmul_f32(m * 128, n * 128, k * 128);
+                ggml_vk_test_matmul_f16(m * 128, n * 128, k * 128);
+            }
+        }
+    }
+#endif
+}
+
+static vk_pipeline* ggml_get_to_fp32_vk(ggml_type type) {
+    switch (type) {
+        // case GGML_TYPE_Q4_0:
+        //     return &dequantize_row_q4_0_cl;
+        // case GGML_TYPE_Q4_1:
+        //     return &dequantize_row_q4_1_cl;
+        // case GGML_TYPE_Q5_0:
+        //     return &dequantize_row_q5_0_cl;
+        // case GGML_TYPE_Q5_1:
+        //     return &dequantize_row_q5_1_cl;
+        // case GGML_TYPE_Q8_0:
+        //     return &dequantize_row_q8_0_cl;
+        case GGML_TYPE_F16:
+            return &vk_pipeline_f16_to_f32;
+        default:
+            return nullptr;
+    }
+}
 
 // buffer pool for vulkan
@@ -381,15 +468,6 @@ void ggml_vk_host_free(void* ptr) {
     ggml_vk_destroy_buffer(*buf);
 }
 
-static vk::CommandBuffer ggml_vk_cmd_buffer_create(vk::CommandPool& pool) {
-    vk::CommandBufferAllocateInfo command_buffer_alloc_info(
-        pool,
-        vk::CommandBufferLevel::ePrimary,
-        1);
-    const std::vector<vk::CommandBuffer> cmd_buffers = vk_device.allocateCommandBuffers(command_buffer_alloc_info);
-    return cmd_buffers.front();
-}
-
 static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src, size_t size) {
     VkMemoryPropertyFlags mem_prop_flags;
     vmaGetAllocationMemoryProperties(vk_allocator, dst->allocation, &mem_prop_flags);
@@ -616,30 +694,6 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     ggml_vk_pool_malloc(sizeof(float) * y_ne, &d_Y, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
     ggml_vk_pool_malloc(sizeof(float) * d_ne, &d_D, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
 
-    vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, 3);
-    vk::DescriptorPoolCreateInfo descriptor_pool_create_info(vk::DescriptorPoolCreateFlags(), 1, descriptor_pool_size);
-    vk::DescriptorPool descriptor_pool = vk_device.createDescriptorPool(descriptor_pool_create_info);
-
-    vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(descriptor_pool, 1, &vk_pipeline_matmul_f32.dsl);
-    const std::vector<vk::DescriptorSet> descriptor_sets = vk_device.allocateDescriptorSets(descriptor_set_alloc_info);
-    vk::DescriptorSet descriptor_set = descriptor_sets.front();
-    vk::DescriptorBufferInfo d_X_buffer_info(d_X.buffer, 0, sizeof(float) * x_ne);
-    vk::DescriptorBufferInfo d_Y_buffer_info(d_Y.buffer, 0, sizeof(float) * y_ne);
-    vk::DescriptorBufferInfo d_D_buffer_info(d_D.buffer, 0, sizeof(float) * d_ne);
-
-    const std::vector<vk::WriteDescriptorSet> write_descriptor_sets = {
-        {descriptor_set, 0, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_X_buffer_info},
-        {descriptor_set, 1, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_Y_buffer_info},
-        {descriptor_set, 2, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_D_buffer_info},
-    };
-    vk_device.updateDescriptorSets(write_descriptor_sets, {});
-
-    std::array<int, 6> push_constants = { (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01 };
-    assert( ( sizeof( push_constants ) <= vk_physical_device.getProperties().limits.maxPushConstantsSize ) && "Too many push constants" );
-
-    vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_compute);
-    vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
-
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             // copy data to device
@@ -649,34 +703,16 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
             ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02);
 
             // compute
+            vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
 #ifdef VK_CHK_KERNEL
             auto begin = std::chrono::high_resolution_clock::now();
 #endif
 
-            vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
-            cmd_buffer.begin(cmd_buffer_begin_info);
-            cmd_buffer.pushConstants(vk_pipeline_matmul_f32.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constants);
-            cmd_buffer.bindPipeline(vk::PipelineBindPoint::eCompute, vk_pipeline_matmul_f32.pipeline);
-            cmd_buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
-                                          vk_pipeline_matmul_f32.layout,
-                                          0,
-                                          { descriptor_set },
-                                          {});
-            cmd_buffer.dispatch(CEIL_DIV(ne01, 128), CEIL_DIV(ne11, 128), 1);
-            cmd_buffer.end();
-
-            vk::Queue queue = vk_device.getQueue(vk_compute_queue_family_index, 0);
-
-            vk::SubmitInfo submit_info(0,
-                                       nullptr,
-                                       nullptr,
-                                       1,
-                                       &cmd_buffer);
-
             // Wait for transfers to finish
             vk_device.getQueue(vk_transfer_queue_family_index, 0).waitIdle();
 
-            queue.submit({ submit_info }, fence);
+            ggml_vk_dispatch_pipeline(vk_pipeline_matmul_f32, {&d_X, &d_Y, &d_D}, { (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01 }, { (uint32_t)ne01, (uint32_t)ne11, 1}, fence);
 
             vk_device.waitForFences({ fence },
                                     true,
                                     uint64_t(-1));
@@ -687,6 +723,8 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
             std::cout << "m=" << ne01 << " n=" << ne11 << " k=" << ne10 << " matmul " << std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0 << "ms" << std::endl;
 #endif
 
+            vk_device.destroyFence(fence);
+
             // copy dst to host
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
             ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne);
@@ -714,8 +752,6 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }
 
-    vk_device.destroyFence(fence);
-
     if (src0->backend != GGML_BACKEND_GPU) {
         ggml_vk_pool_free(d_X);
     }
@@ -763,30 +799,6 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src0_cont_rows = nb00 == sizeof(float);
     bool src0_cont_cols = (size_t)nb01 == ne01*sizeof(float);
 
-    vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, 3);
-    vk::DescriptorPoolCreateInfo descriptor_pool_create_info(vk::DescriptorPoolCreateFlags(), 1, descriptor_pool_size);
-    vk::DescriptorPool descriptor_pool = vk_device.createDescriptorPool(descriptor_pool_create_info);
-
-    vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(descriptor_pool, 1, &vk_pipeline_matmul_f16.dsl);
-    const std::vector<vk::DescriptorSet> descriptor_sets = vk_device.allocateDescriptorSets(descriptor_set_alloc_info);
-    vk::DescriptorSet descriptor_set = descriptor_sets.front();
-    vk::DescriptorBufferInfo d_X_buffer_info(d_X.buffer, 0, sizeof(float) * x_ne);
-    vk::DescriptorBufferInfo d_Y_buffer_info(d_Y.buffer, 0, sizeof(float) * y_ne);
-    vk::DescriptorBufferInfo d_D_buffer_info(d_D.buffer, 0, sizeof(float) * d_ne);
-
-    const std::vector<vk::WriteDescriptorSet> write_descriptor_sets = {
-        {descriptor_set, 0, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_X_buffer_info},
-        {descriptor_set, 1, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_Y_buffer_info},
-        {descriptor_set, 2, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_D_buffer_info},
-    };
-    vk_device.updateDescriptorSets(write_descriptor_sets, {});
-
-    std::array<int, 6> push_constants = { (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01 };
-    assert( ( sizeof( push_constants ) <= vk_physical_device.getProperties().limits.maxPushConstantsSize ) && "Too many push constants" );
-
-    vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_compute);
-    vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
-
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             // copy data to device
@@ -818,34 +830,12 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
             ggml_vk_buffer_write(&d_X, 0, tmp, sizeof(float) * x_ne);
 
             // compute
+            vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
 #ifdef VK_CHK_KERNEL
             auto begin = std::chrono::high_resolution_clock::now();
 #endif
 
-            vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
-            cmd_buffer.begin(cmd_buffer_begin_info);
-            cmd_buffer.pushConstants(vk_pipeline_matmul_f32.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constants);
-            cmd_buffer.bindPipeline(vk::PipelineBindPoint::eCompute, vk_pipeline_matmul_f32.pipeline);
-            cmd_buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
-                                          vk_pipeline_matmul_f32.layout,
-                                          0,
-                                          { descriptor_set },
-                                          {});
-            cmd_buffer.dispatch(CEIL_DIV(ne01, 128), CEIL_DIV(ne11, 128), 1);
-            cmd_buffer.end();
-
-            vk::Queue queue = vk_device.getQueue(vk_compute_queue_family_index, 0);
-
-            vk::SubmitInfo submit_info(0,
-                                       nullptr,
-                                       nullptr,
-                                       1,
-                                       &cmd_buffer);
-
-            // Wait for transfers to finish
-            vk_device.getQueue(vk_transfer_queue_family_index, 0).waitIdle();
-
-            queue.submit({ submit_info }, fence);
+            ggml_vk_dispatch_pipeline(vk_pipeline_matmul_f32, {&d_X, &d_Y, &d_D}, { (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01 }, { (uint32_t)ne01, (uint32_t)ne11, 1}, fence);
 
             vk_device.waitForFences({ fence },
                                     true,
                                     uint64_t(-1));
@@ -856,9 +846,11 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
             std::cout << "m=" << ne01 << " n=" << ne11 << " k=" << ne10 << " matmul " << std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0 << "ms" << std::endl;
 #endif
 
+            vk_device.destroyFence(fence);
+
             // copy dst to host
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-            ggml_vk_buffer_read(&d_D, 0, tmp, sizeof(float) * d_ne);
+            ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne);
 
 #ifdef VK_CHK_KERNEL
             for (size_t i = 0; i < d_ne; i++) {
@@ -873,8 +865,6 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }
 
-    vk_device.destroyFence(fence);
-
     if (src0->backend != GGML_BACKEND_GPU) {
         ggml_vk_pool_free(d_X);
     }
@@ -883,127 +873,117 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 }
 
 static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    assert(false);
-//    const int64_t ne00 = src0->ne[0];
-//    const int64_t ne01 = src0->ne[1];
-//    const int64_t ne02 = src0->ne[2];
-//    const int64_t ne03 = src0->ne[3];
-//
-//    const int64_t ne10 = src1->ne[0];
-//    const int64_t ne11 = src1->ne[1];
-//
-//    const int nb2 = dst->nb[2];
-//    const int nb3 = dst->nb[3];
-//    const ggml_type type = src0->type;
-//    const bool mul_mat_vec = ne11 == 1;
-//
-//    const float alpha = 1.0f;
-//    const float beta = 0.0f;
-//    const int x_ne = ne01 * ne00;
-//    const int y_ne = ne11 * ne10;
-//    const int d_ne = ne11 * ne01;
-//    const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
-//
-//    size_t x_size;
-//    size_t y_size;
-//    size_t d_size;
-//    size_t q_size;
-//    vk_buffer d_X;
-//    if (!mul_mat_vec) {
-//        d_X = ggml_vk_pool_malloc(sizeof(float) * x_ne, &x_size);
-//    }
-//    vk_buffer d_Y = ggml_vk_pool_malloc(sizeof(float) * y_ne, &y_size);
-//    vk_buffer d_D = ggml_vk_pool_malloc(sizeof(float) * d_ne, &d_size);
-//    vk_buffer d_Q;
-//    if (src0->backend == GGML_BACKEND_CPU) {
-//        d_Q = ggml_vk_pool_malloc(q_sz, &q_size);
-//    }
-//
-//    vk_kernel* to_fp32_vk = ggml_get_to_fp32_vk(type);
-//    vk_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_vk(type);
-//    GGML_ASSERT(to_fp32_vk != nullptr);
-//
-//    size_t ev_idx = 0;
-//    std::vector<vk_event> events;
-//
-//    for (int64_t i03 = 0; i03 < ne03; i03++) {
-//        for (int64_t i02 = 0; i02 < ne02; i02++) {
-//            // copy src0 to device if necessary
-//            if (src0->backend == GGML_BACKEND_CPU) {
-//                events.emplace_back();
-//                VK_CHECK(ggml_vk_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-//            } else if (src0->backend == GGML_BACKEND_GPU) {
-//                d_Q = (vk_buffer) src0->data;
-//            } else {
-//                GGML_ASSERT(false);
-//            }
-//            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
-//                // copy src1 to device
-//                events.emplace_back();
-//                VK_CHECK(ggml_vk_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
-//
-//                // compute
-//                const size_t global = ne01 * VK_DMMV_BLOCK_SIZE;
-//                const size_t local = VK_DMMV_BLOCK_SIZE;
-//                const vk_int ncols = ne00;
-//                events.emplace_back();
-//                VK_CHECK(vkSetKernelArg(*dmmv, 0, sizeof(vk_buffer), &d_Q));
-//                VK_CHECK(vkSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
-//                VK_CHECK(vkSetKernelArg(*dmmv, 2, sizeof(vk_buffer), &d_Y));
-//                VK_CHECK(vkSetKernelArg(*dmmv, 3, sizeof(vk_buffer), &d_D));
-//                VK_CHECK(vkSetKernelArg(*dmmv, 4, sizeof(vk_int), &ncols));
-//                VK_CHECK(vkEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
-//            } else { // general dequantization kernel + VKBlast matrix matrix multiplication
-//                // convert src0 to fp32 on device
-//                const size_t global = x_ne;
-//                VK_CHECK(vkSetKernelArg(*to_fp32_vk, 0, sizeof(vk_buffer), &d_Q));
-//                VK_CHECK(vkSetKernelArg(*to_fp32_vk, 1, sizeof(vk_buffer), &d_X));
-//                VK_CHECK(vkEnqueueNDRangeKernel(queue, *to_fp32_vk, 1, NULL, &global, NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
-//
-//                // copy src1 to device
-//                VK_CHECK(ggml_vk_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
-//
-//                events.emplace_back();
-//
-//                // wait for conversion
-//                VK_CHECK(vkFinish(queue));
-//
-//                // compute
-//                vkblast::StatusCode status = vkblast::Gemm(vkblast::Layout::kColMajor,
-//                                                           vkblast::Transpose::kYes, vkblast::Transpose::kNo,
-//                                                           ne01, ne11, ne10,
-//                                                           alpha,
-//                                                           d_X, 0, ne00,
-//                                                           d_Y, 0, ne10,
-//                                                           beta,
-//                                                           d_D, 0, ne01,
-//                                                           &queue, events.data() + ev_idx++);
-//
-//                if (status != vkblast::StatusCode::kSuccess) {
-//                    GGML_ASSERT(false);
-//                }
-//            }
-//
-//            // copy dst to host
-//            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-//            VK_CHECK(vkEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
-//            for (auto *event : events) {
-//                vkReleaseEvent(event);
-//            }
-//
-//            ev_idx = 0;
-//            events.vkear();
-//        }
-//    }
-//
-//    if (!mul_mat_vec) {
-//        ggml_vk_pool_free(d_X, x_size);
-//    }
-//    ggml_vk_pool_free(d_Y, y_size);
-//    ggml_vk_pool_free(d_D, d_size);
-//    if (src0->backend == GGML_BACKEND_CPU) {
-//        ggml_vk_pool_free(d_Q, q_size);
-//    }
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+    const ggml_type type = src0->type;
+    const bool mul_mat_vec = false; // ne11 == 1;
+
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+    const int x_ne = ne01 * ne00;
+    const int y_ne = ne11 * ne10;
+    const int d_ne = ne11 * ne01;
+    const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
+
+    vk_buffer d_X;
+    vk_buffer d_Y;
+    vk_buffer d_D;
+    if (!mul_mat_vec) {
+        ggml_vk_pool_malloc(sizeof(float) * x_ne, &d_X, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
+    }
+    ggml_vk_pool_malloc(sizeof(float) * y_ne, &d_Y, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
+    ggml_vk_pool_malloc(sizeof(float) * d_ne, &d_D, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
+    vk_buffer d_Q;
+    if (src0->backend == GGML_BACKEND_CPU) {
+        ggml_vk_pool_malloc(q_sz, &d_Q, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
+    }
+
+    vk_pipeline* to_fp32_vk = ggml_get_to_fp32_vk(type);
+    // vk_pipeline* dmmv = ggml_get_dequantize_mul_mat_vec_vk(type);
+    GGML_ASSERT(to_fp32_vk != nullptr);
+
+    size_t ev_idx = 0;
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            // copy src0 to device if necessary
+            if (src0->backend == GGML_BACKEND_CPU) {
+                ggml_vk_h2d_tensor_2d(&d_Q, 0, src0, i03, i02);
+            } else if (src0->backend == GGML_BACKEND_GPU) {
+                d_Q = *(vk_buffer *) src0->data;
+            } else {
+                GGML_ASSERT(false);
+            }
+            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
+                GGML_ASSERT(false);
+                // // copy src1 to device
+                // events.emplace_back();
+                // VK_CHECK(ggml_vk_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
+
+                // // compute
+                // const size_t global = ne01 * VK_DMMV_BLOCK_SIZE;
+                // const size_t local = VK_DMMV_BLOCK_SIZE;
+                // const vk_int ncols = ne00;
+                // events.emplace_back();
+                // VK_CHECK(vkSetKernelArg(*dmmv, 0, sizeof(vk_buffer), &d_Q));
+                // VK_CHECK(vkSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
+                // VK_CHECK(vkSetKernelArg(*dmmv, 2, sizeof(vk_buffer), &d_Y));
+                // VK_CHECK(vkSetKernelArg(*dmmv, 3, sizeof(vk_buffer), &d_D));
+                // VK_CHECK(vkSetKernelArg(*dmmv, 4, sizeof(vk_int), &ncols));
+                // VK_CHECK(vkEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+            } else { // general dequantization kernel + VK matrix matrix multiplication
+                // convert src0 to fp32 on device
+                // Wait for transfers to finish
+                vk_device.getQueue(vk_transfer_queue_family_index, 0).waitIdle();
+
+                vk::Fence fence = vk_device.createFence(vk::FenceCreateFlags{});
+
+                ggml_vk_dispatch_pipeline(*to_fp32_vk, {&d_Q, &d_X}, { (int)x_ne }, { (uint32_t)x_ne, 1, 1}, fence);
+
+                // copy src1 to device
+                ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02);
+
+                // wait for conversion
+                vk_device.waitForFences({ fence },
+                                        true,
+                                        uint64_t(-1));
+
+                vk_device.destroyFence(fence);
+
+                // compute
+                fence = vk_device.createFence(vk::FenceCreateFlags{});
+
+                ggml_vk_dispatch_pipeline(vk_pipeline_matmul_f32, {&d_X, &d_Y, &d_D}, { (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01 }, { (uint32_t)ne01, (uint32_t)ne11, 1}, fence);
+
+                vk_device.waitForFences({ fence },
+                                        true,
+                                        uint64_t(-1));
+
+                vk_device.destroyFence(fence);
+            }
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne);
+        }
+    }
+
+    if (!mul_mat_vec) {
+        ggml_vk_pool_free(d_X);
+    }
+    ggml_vk_pool_free(d_Y);
+    ggml_vk_pool_free(d_D);
+    if (src0->backend == GGML_BACKEND_CPU) {
+        ggml_vk_pool_free(d_Q);
+    }
 }
 
@@ -1017,7 +997,7 @@ bool ggml_vk_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 
     if ((src0->type == GGML_TYPE_F32 /*|| src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)*/) &&
         src1->type == GGML_TYPE_F32 &&
        dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
+        ((ne0 >= 128 && ne1 >= 32 && ne10 >= 128) || src0->backend == GGML_BACKEND_GPU)) {
         return true;
     }
 
@@ -1086,24 +1066,6 @@ void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k) {
     ggml_vk_pool_malloc(sizeof(float) * y_ne, &d_Y, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
     ggml_vk_pool_malloc(sizeof(float) * d_ne, &d_D, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
 
-    vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, 3);
-    vk::DescriptorPoolCreateInfo descriptor_pool_create_info(vk::DescriptorPoolCreateFlags(), 1, descriptor_pool_size);
-    vk::DescriptorPool descriptor_pool = vk_device.createDescriptorPool(descriptor_pool_create_info);
-
-    vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(descriptor_pool, 1, &vk_pipeline_matmul_f32.dsl);
-    const std::vector<vk::DescriptorSet> descriptor_sets = vk_device.allocateDescriptorSets(descriptor_set_alloc_info);
-    vk::DescriptorSet descriptor_set = descriptor_sets.front();
-    vk::DescriptorBufferInfo d_X_buffer_info(d_X.buffer, 0, sizeof(float) * x_ne);
-    vk::DescriptorBufferInfo d_Y_buffer_info(d_Y.buffer, 0, sizeof(float) * y_ne);
-    vk::DescriptorBufferInfo d_D_buffer_info(d_D.buffer, 0, sizeof(float) * d_ne);
-
-    const std::vector<vk::WriteDescriptorSet> write_descriptor_sets = {
-        {descriptor_set, 0, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_X_buffer_info},
-        {descriptor_set, 1, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_Y_buffer_info},
-        {descriptor_set, 2, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_D_buffer_info},
-    };
-    vk_device.updateDescriptorSets(write_descriptor_sets, {});
-
     std::array<int, 6> push_constants = { (int)m, (int)n, (int)k, (int)k, (int)k, (int)m };
push constants" ); @@ -1121,36 +1083,15 @@ void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k) { ggml_vk_buffer_write(&d_X, 0, x, sizeof(float) * x_ne); ggml_vk_buffer_write(&d_Y, 0, y, sizeof(float) * y_ne); - vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_compute); - vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo()); - - // compute - auto begin = std::chrono::high_resolution_clock::now(); - - vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit); - cmd_buffer.begin(cmd_buffer_begin_info); - cmd_buffer.pushConstants(vk_pipeline_matmul_f32.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constants); - cmd_buffer.bindPipeline(vk::PipelineBindPoint::eCompute, vk_pipeline_matmul_f32.pipeline); - cmd_buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute, - vk_pipeline_matmul_f32.layout, - 0, - { descriptor_set }, - {}); - cmd_buffer.dispatch(CEIL_DIV(m, 128), CEIL_DIV(n, 128), 1); - cmd_buffer.end(); - - vk::Queue queue = vk_device.getQueue(vk_compute_queue_family_index, 0); - - vk::SubmitInfo submit_info(0, - nullptr, - nullptr, - 1, - &cmd_buffer); - // Wait for transfers to finish vk_device.getQueue(vk_transfer_queue_family_index, 0).waitIdle(); - queue.submit({ submit_info }, fence); + vk::Fence fence = vk_device.createFence(vk::FenceCreateFlags{}); + + auto begin = std::chrono::high_resolution_clock::now(); + + ggml_vk_dispatch_pipeline(vk_pipeline_matmul_f32, {&d_X, &d_Y, &d_D}, { (int)m, (int)n, (int)k, (int)k, (int)k, (int)m }, { (uint32_t)m, (uint32_t)n, 1}, fence); + vk_device.waitForFences({ fence }, true, uint64_t(-1)); @@ -1203,24 +1144,6 @@ void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k) { ggml_vk_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &d_Y, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT); ggml_vk_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_D, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT); - vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, 3); - vk::DescriptorPoolCreateInfo descriptor_pool_create_info(vk::DescriptorPoolCreateFlags(), 1, descriptor_pool_size); - vk::DescriptorPool descriptor_pool = vk_device.createDescriptorPool(descriptor_pool_create_info); - - vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(descriptor_pool, 1, &vk_pipeline_matmul_f32.dsl); - const std::vector descriptor_sets = vk_device.allocateDescriptorSets(descriptor_set_alloc_info); - vk::DescriptorSet descriptor_set = descriptor_sets.front(); - vk::DescriptorBufferInfo d_X_buffer_info(d_X.buffer, 0, sizeof(ggml_fp16_t) * x_ne); - vk::DescriptorBufferInfo d_Y_buffer_info(d_Y.buffer, 0, sizeof(ggml_fp16_t) * y_ne); - vk::DescriptorBufferInfo d_D_buffer_info(d_D.buffer, 0, sizeof(ggml_fp16_t) * d_ne); - - const std::vector write_descriptor_sets = { - {descriptor_set, 0, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_X_buffer_info}, - {descriptor_set, 1, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_Y_buffer_info}, - {descriptor_set, 2, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_D_buffer_info}, - }; - vk_device.updateDescriptorSets(write_descriptor_sets, {}); - std::array push_constants = { (int)m, (int)n, (int)k, (int)k, (int)k, (int)m }; assert( ( sizeof( push_constants ) <= vk_physical_device.getProperties().limits.maxPushConstantsSize ) && "Too many push constants" ); @@ -1238,36 +1161,12 @@ void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k) { ggml_vk_buffer_write(&d_X, 0, x, sizeof(ggml_fp16_t) * x_ne); ggml_vk_buffer_write(&d_Y, 0, y, 
     ggml_vk_buffer_write(&d_Y, 0, y, sizeof(ggml_fp16_t) * y_ne);
 
-    vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_compute);
-    vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
+    vk::Fence fence = vk_device.createFence(vk::FenceCreateFlags{});
 
-    // compute
     auto begin = std::chrono::high_resolution_clock::now();
 
-    vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
-    cmd_buffer.begin(cmd_buffer_begin_info);
-    cmd_buffer.pushConstants(vk_pipeline_matmul_f32.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constants);
-    cmd_buffer.bindPipeline(vk::PipelineBindPoint::eCompute, vk_pipeline_matmul_f32.pipeline);
-    cmd_buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
-                                  vk_pipeline_matmul_f32.layout,
-                                  0,
-                                  { descriptor_set },
-                                  {});
-    cmd_buffer.dispatch(CEIL_DIV(m, 32), CEIL_DIV(n, 32), 1);
-    cmd_buffer.end();
+    ggml_vk_dispatch_pipeline(vk_pipeline_matmul_f16, {&d_X, &d_Y, &d_D}, { (int)m, (int)n, (int)k, (int)k, (int)k, (int)m }, { (uint32_t)m, (uint32_t)n, 1}, fence);
 
-    vk::Queue queue = vk_device.getQueue(vk_compute_queue_family_index, 0);
-
-    vk::SubmitInfo submit_info(0,
-                               nullptr,
-                               nullptr,
-                               1,
-                               &cmd_buffer);
-
-    // Wait for transfers to finish
-    vk_device.getQueue(vk_transfer_queue_family_index, 0).waitIdle();
-
-    queue.submit({ submit_info }, fence);
     vk_device.waitForFences({ fence },
                             true,
                             uint64_t(-1));
diff --git a/vk_shaders/f16_to_f32.glsl b/vk_shaders/f16_to_f32.glsl
new file mode 100644
index 000000000..f9ce15c40
--- /dev/null
+++ b/vk_shaders/f16_to_f32.glsl
@@ -0,0 +1,21 @@
+#version 450
+
+#extension GL_EXT_shader_16bit_storage : require
+
+layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A { float16_t data_a[]; };
+layout (binding = 1) writeonly buffer D { float data_b[]; };
+
+layout (push_constant) uniform parameter
+{
+    int N;
+} p;
+
+void main() {
+    // Use the global invocation index: the dispatch launches CEIL_DIV(N, 32)
+    // workgroups, so the local index alone would only ever cover 0..31.
+    const int idx = int(gl_GlobalInvocationID.x);
+
+    if (idx < p.N) {
+        data_b[idx] = float(data_a[idx]);
+    }
+}
diff --git a/vk_shaders/matmul_f16.glsl b/vk_shaders/matmul_f16.glsl
new file mode 100644
index 000000000..658ab93ed
--- /dev/null
+++ b/vk_shaders/matmul_f16.glsl
@@ -0,0 +1,100 @@
+#version 450
+
+#define BM 128
+#define BN 128
+#define BK 8
+#define TM 8
+#define TN 8
+
+#extension GL_EXT_control_flow_attributes : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+
+layout(local_size_x = (BM * BN) / (TM * TN), local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A { float16_t data_a[]; };
+layout (binding = 1) readonly buffer B { float16_t data_b[]; };
+layout (binding = 2) writeonly buffer D { float16_t data_d[]; };
+
+layout (push_constant) uniform parameter
+{
+    int M;
+    int N;
+    int K;
+    int stride_a;
+    int stride_b;
+    int stride_d;
+} p;
+
+shared float16_t buf_a[BM * (BK+1)];
+shared float16_t buf_b[BN * (BK+1)];
+
+void main() {
+    const int ir = int(gl_WorkGroupID.x);
+    const int ic = int(gl_WorkGroupID.y);
+
+    const int rstride = BM / TM;
+
+    const int lr = int(gl_LocalInvocationID.x % rstride);
+    const int lc = int(gl_LocalInvocationID.x / rstride);
+
+    const int loadr = int(gl_LocalInvocationID.x % BK);
+    const int loadc = int(gl_LocalInvocationID.x / BK);
+
+    const int loadstride = int(gl_WorkGroupSize.x);
+
+    int pos_a = ir * BM * p.stride_a;
+    int pos_b = ic * BN * p.stride_b;
+
+    float16_t sums[TM * TN];
+    float16_t cache_a[TM];
+    float16_t cache_b[TN];
+
+    [[unroll]] for (int i = 0; i < TM*TN; i++) {
+        sums[i] = 0.0hf;
+    }
+
+    [[unroll]] for (int block = 0; block < p.K; block += BK) {
+        [[unroll]] for (int l = 0; l < BM * BK; l += loadstride) {
+            const int lr = l % BK;
+            const int lc = l / BK;
+            buf_a[(loadc + lc) * (BK+1) + loadr + lr] = data_a[pos_a + (loadc + lc) * p.stride_a + loadr + lr];
+        }
+        [[unroll]] for (int l = 0; l < BN * BK; l += loadstride) {
+            const int lr = l % BK;
+            const int lc = l / BK;
+            buf_b[(loadc + lc) * (BK+1) + loadr + lr] = data_b[pos_b + (loadc + lc) * p.stride_b + loadr + lr];
+        }
+
+        barrier();
+
+        pos_a += BK;
+        pos_b += BK;
+
+        [[unroll]] for (int i = 0; i < BK; i++) {
+            // Load from shared into cache (cache_a holds TM values per thread)
+            [[unroll]] for (int j = 0; j < TM; j++) {
+                cache_a[j] = buf_a[(lr + j*rstride) * (BK+1) + i];
+            }
+            [[unroll]] for (int j = 0; j < TN; j++) {
+                cache_b[j] = buf_b[(lc * TN + j) * (BK+1) + i];
+            }
+
+            [[unroll]] for (int cc = 0; cc < TN; cc++) {
+                [[unroll]] for (int cr = 0; cr < TM; cr++) {
+                    sums[cc * TM + cr] += cache_a[cr] * cache_b[cc];
+                }
+            }
+        }
+
+        barrier();
+    }
+
+    const int dr = ir * BM + lr;
+    const int dc = ic * BN + lc * TN;
+
+    [[unroll]] for (int cc = 0; cc < TN; cc++) {
+        [[unroll]] for (int cr = 0; cr < TM; cr++) {
+            data_d[(dc + cc) * p.stride_d + dr + cr*rstride] = sums[cc * TM + cr];
+        }
+    }
+}
diff --git a/vk_shaders/matmul_f16.spv b/vk_shaders/matmul_f16.spv
deleted file mode 100644
index a52c8f676765148d8f96ce795b26684721d894ac..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
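
Usage sketch for review: the patch pairs ggml_vk_create_pipeline(path, entrypoint, parameter_count, push_constant_count, wg_denoms) with ggml_vk_dispatch_pipeline(pipeline, buffers, push_constants, elements, fence), which binds one storage buffer per parameter, pushes the constants, and dispatches CEIL_DIV(elements[i], wg_denoms[i]) workgroups. Below is a minimal, untested example of driving the new FP16-to-FP32 kernel through these helpers; the function name example_f16_to_f32 and the buffer sizes are illustrative only, not part of the patch.

    // Hypothetical helper, for illustration only: converts n half floats to
    // floats on the GPU using the pipeline/dispatch helpers from this patch.
    static void example_f16_to_f32(const ggml_fp16_t * src, float * dst, int n) {
        // 2 descriptor bindings (input, output), 1 int push constant (N),
        // wg_denoms {32, 1, 1} to match local_size_x = 32 in f16_to_f32.glsl.
        vk_pipeline to_f32 = ggml_vk_create_pipeline("vk_shaders/f16_to_f32.spv", "main", 2, 1, {32, 1, 1});

        vk_buffer d_src, d_dst;
        ggml_vk_pool_malloc(sizeof(ggml_fp16_t) * n, &d_src, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
        ggml_vk_pool_malloc(sizeof(float) * n, &d_dst, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
        ggml_vk_buffer_write(&d_src, 0, src, sizeof(ggml_fp16_t) * n);

        // Dispatch CEIL_DIV(n, 32) workgroups; the fence signals completion.
        vk::Fence fence = vk_device.createFence(vk::FenceCreateFlags{});
        ggml_vk_dispatch_pipeline(to_f32, {&d_src, &d_dst}, { n }, { (uint32_t)n, 1, 1 }, fence);
        vk_device.waitForFences({ fence }, true, uint64_t(-1));
        vk_device.destroyFence(fence);

        ggml_vk_buffer_read(&d_dst, 0, dst, sizeof(float) * n);
        ggml_vk_pool_free(d_src);
        ggml_vk_pool_free(d_dst);
    }

Note that callers pass raw element counts, not workgroup counts; the division by wg_denoms happens inside ggml_vk_dispatch_pipeline.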