Vulkan Mixture of Experts (MoE) support (#7628)

* Finish Vulkan mul_mat_id implementation

* Add Vulkan sum_rows and div ops

* Fix MUL_MAT_ID matrix matrix shader

* Fix MUL_MAT_ID matrix vector shader dispatch size

* Fix MUL_MAT_ID matrix vector shader and dispatch code

* Update Vulkan CPU offload for MUL_MAT_ID

* Fix crash when using split mode none and setting a main GPU
This commit is contained in:
0cc4m 2024-06-03 10:59:14 +02:00 committed by GitHub
parent a10cda58d3
commit 3d7ebf6312
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 73389 additions and 13839 deletions

View file

@ -1002,9 +1002,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
params.main_gpu = std::stoi(argv[i]);
#ifndef GGML_USE_CUDA_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
#endif // GGML_USE_CUDA_SYCL
#ifndef GGML_USE_CUDA_SYCL_VULKAN
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
#endif // GGML_USE_CUDA_SYCL_VULKAN
return true;
}
if (arg == "--split-mode" || arg == "-sm") {
@ -1030,9 +1030,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
#ifndef GGML_USE_CUDA_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUDA_SYCL
#ifndef GGML_USE_CUDA_SYCL_VULKAN
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUDA_SYCL_VULKAN
return true;
}
if (arg == "--tensor-split" || arg == "-ts") {