vulkan: Optimize soft_max (#10301)
* vulkan: Optimize soft_max Large soft_max could already saturate memory, but small/medium sizes were pretty slow. The bulk of the gains for them comes from using a smaller workgroup size, and making the workgroup size match the subgroup size also makes the barriers much cheaper. Cache some values in locals to avoid refetching/recomputing. And stamp out a few "template instantiations" so smaller cases will fully unroll. Add a missing early return for OOB rows. This happens when there are more than 512 rows and the dispatch is 512 x H. * vulkan: Further soft_max optimizations Restore the workgroup size of 512 case, use it for >1024. Use unrollable loops for more iteration counts.
This commit is contained in:
parent
557924f222
commit
b3e585988f
3 changed files with 107 additions and 28 deletions
|
@ -3823,6 +3823,14 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
|||
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
|
||||
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, 1.0f, 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, 1.0f, 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {1024, 1024, 10, 1}, false, 1.0f, 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 1024, 10, 1}, false, 1.0f, 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {256, 256, 20, 1}, false, 1.0f, 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {64, 64, 20, 1}, false, 1.0f, 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 64, 20, 1}, false, 1.0f, 0.0f));
|
||||
|
||||
for (int bs : {1, 512}) {
|
||||
for (ggml_type type_a : all_types) {
|
||||
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue