vulkan: im2col and matmul optimizations for stable diffusion (#10942)
* tests: Add im2col perf tests * vulkan: optimize im2col, more elements per thread * vulkan: increase small tile size for NV_coopmat2 * vulkan: change im2col to 512 elements per workgroup
This commit is contained in:
parent
fdd2188912
commit
a813badbbd
3 changed files with 66 additions and 29 deletions
|
@ -1404,10 +1404,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
// spec constants and tile sizes for non-quant matmul/matmul_id
|
||||
l_warptile = { 256, 128, 256, 64 };
|
||||
m_warptile = { 256, 128, 128, 64 };
|
||||
s_warptile = { 128, 32, 16, 64 };
|
||||
s_warptile = { 128, 64, 64, 64 };
|
||||
l_wg_denoms = {128, 256, 1 };
|
||||
m_wg_denoms = {128, 128, 1 };
|
||||
s_wg_denoms = { 32, 16, 1 };
|
||||
s_wg_denoms = { 64, 64, 1 };
|
||||
|
||||
// spec constants and tile sizes for quant matmul (non-Qi_K)
|
||||
l_warptile_mmq = { 256, 128, 256, 64 };
|
||||
|
@ -2017,11 +2017,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
|
||||
if (device->float_controls_rte_fp16) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
|
||||
} else {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
|
||||
}
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue