vulkan : implement Stable Diffusion operators (ggml/904)

* Fix Vulkan repeat op

* Implement Vulkan concat op

* Delete old Vulkan shader generator

* Implement Vulkan im2col op

* Implement Vulkan unary gelu_quick op

* Implement Vulkan group_norm op

* Implement Vulkan timestep_embedding op

* Implement Vulkan upscale op

* Fix Vulkan vk_context tensor extra index issue

* Fix Vulkan matmul shader parameter bug

* Properly fix Vulkan matmul shader parameter bug

* Add Vulkan ADD f16 + f32 -> f16 operator support

* Implement Vulkan tanh op

* Fix Vulkan group count too large Validation error on non-Nvidia GPUs

* Throw error when too much memory is requested

* Fix another Vulkan group count too large Validation error on non-Nvidia GPUs

* Fix matmul MMQ condition

* Implement Vulkan pad op

* Fix Vulkan crash when tensor is used multiple times in a compute graph

* Add Vulkan CONCAT f16 + f16 -> f16 op

* Add Vulkan LEAKY_RELU op
This commit is contained in:
0cc4m 2024-08-04 17:28:08 +02:00 committed by Georgi Gerganov
parent 655858ace0
commit a3738b2fa7
28 changed files with 1032 additions and 293 deletions

View file

@ -30,20 +30,6 @@
#define ASYNCIO_CONCURRENCY 64
// define prototypes
void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str);
bool directory_exists(const std::string& path);
bool create_directory(const std::string& path);
std::string to_uppercase(const std::string& input);
bool string_ends_with(const std::string& str, const std::string& suffix);
std::string join_paths(const std::string& path1, const std::string& path2);
std::string basename(const std::string &path);
void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16);
std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b);
void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmul_id);
void process_shaders(std::vector<std::future<void>>& tasks);
void write_output_files();
std::mutex lock;
std::vector<std::pair<std::string, std::string>> shader_fnames;
@ -52,7 +38,7 @@ std::string input_dir = "vulkan-shaders";
std::string output_dir = "/tmp";
std::string target_hpp = "ggml-vulkan-shaders.hpp";
std::string target_cpp = "ggml-vulkan-shaders.cpp";
bool clean = true;
bool no_clean = false;
const std::vector<std::string> type_names = {
"f32",
@ -283,9 +269,12 @@ void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmu
for (const auto& tname : type_names) {
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
// For unaligned, load one at a time for f32/f16, or two at a time for quants
std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2";
// For aligned matmul loads
std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2";
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16);
@ -354,6 +343,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
@ -371,6 +363,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
@ -396,15 +391,42 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
@ -438,6 +460,17 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
}
void write_output_files() {
@ -478,9 +511,8 @@ void write_output_files() {
}
fprintf(src, "\n};\n\n");
if (clean) {
if (!no_clean) {
std::remove(path.c_str());
// fprintf(stderr, "Removed: %s\n", path.c_str());
}
}
@ -496,18 +528,6 @@ int main(int argc, char** argv) {
}
}
if (argc <= 1 || args.find("--help") != args.end()) {
std::cout << "Usage:\n"
"\tvulkan-shaders-gen [options]\n\n"
"Options:\n"
"\t--glslc <path> Path to glslc executable (default: /usr/bin/glslc)\n"
"\t--input-dir Directory containing shader sources (required)\n"
"\t--output-dir Output directory for generated SPIR-V files and optional C++ headers\n"
"\t--target-hpp <path> Path to generate a header file with shader declarations in C++ format\n"
"\t--target-cpp <path> Path to generate a source code file implementing the declared shaders (optional)\n"
"\t--no-clean Keep temporary SPIR-V files after build (default: remove them)\n";
return EXIT_SUCCESS;
}
if (args.find("--glslc") != args.end()) {
GLSLC = args["--glslc"]; // Path to glslc
}
@ -524,7 +544,7 @@ int main(int argc, char** argv) {
target_cpp = args["--target-cpp"]; // Path to generated cpp file
}
if (args.find("--no-clean") != args.end()) {
clean = false; // Keep temporary SPIR-V files in output-dir after build
no_clean = true; // Keep temporary SPIR-V files in output-dir after build
}
if (!directory_exists(input_dir)) {