From 1947c1200e1f5fd40ca8606454fe097efd3505aa Mon Sep 17 00:00:00 2001 From: arthw <14088817+arthw@users.noreply.github.com> Date: Thu, 1 Aug 2024 11:21:16 +0800 Subject: [PATCH] support setting the main GPU --- ggml/include/ggml-sycl.h | 2 + ggml/src/ggml-sycl.cpp | 8 +++ ggml/src/ggml-sycl/common.cpp | 8 +-- ggml/src/ggml-sycl/common.hpp | 10 +--- ggml/src/ggml-sycl/sycl_device.cpp | 86 +++++++++++++++++++++++++----- ggml/src/ggml-sycl/sycl_device.hpp | 22 ++++++-- src/llama.cpp | 7 ++- 7 files changed, 112 insertions(+), 31 deletions(-) diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h index a50086afa..4368e27a5 100644 --- a/ggml/include/ggml-sycl.h +++ b/ggml/include/ggml-sycl.h @@ -38,6 +38,8 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id); GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int index); GGML_API GGML_CALL void ggml_sycl_set_single_device(int main_gpu_id); +GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id); + // SYCL doesn't support registering host memory, keep here for reference // GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); // GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer); diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 8a0b8ff63..77c7f610f 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -38,6 +38,7 @@ #include "ggml-sycl/backend.hpp" #include "ggml-sycl/presets.hpp" +#include "ggml-sycl/sycl_device.hpp" void ggml_sycl_free_data(struct ggml_tensor * tensor); @@ -5150,6 +5151,13 @@ GGML_CALL int ggml_backend_sycl_get_device_count() { return ggml_sycl_info().device_count; } +GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) { + + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n"); + fprintf(stderr, "ggml_backend_sycl_set_single_device_mode: use single device: [%d]\n", main_gpu_id); + 
ggml_sycl_info(main_gpu_id); +} + GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) { ggml_backend_t sycl_backend = ggml_backend_sycl_init((int) (intptr_t) user_data); return sycl_backend; diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index efd702bf1..da7d8c60e 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -137,7 +137,7 @@ void ggml_backend_sycl_print_sycl_devices() { } } -static ggml_sycl_device_info ggml_sycl_init() try { +static ggml_sycl_device_info ggml_sycl_init(int main_gpu_id) try { static bool initialized = false; if (!initialized) { @@ -176,7 +176,7 @@ static ggml_sycl_device_info ggml_sycl_init() try { initialized = true; } - static ggml_sycl_device_info info; + static ggml_sycl_device_info info(main_gpu_id); if (info.device_count == 0) { fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": no available device found\n", @@ -192,8 +192,8 @@ static ggml_sycl_device_info ggml_sycl_init() try { std::exit(1); } -ggml_sycl_device_info &ggml_sycl_info() { - static ggml_sycl_device_info info = ggml_sycl_init(); +ggml_sycl_device_info &ggml_sycl_info(int main_gpu_id) { + static ggml_sycl_device_info info = ggml_sycl_init(main_gpu_id); return info; } diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index f0c93a52d..5b60835a2 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -81,12 +81,6 @@ static int g_ggml_sycl_debug = 0; typedef sycl::queue *queue_ptr; -enum ggml_sycl_backend_gpu_mode { - SYCL_UNSET_GPU_MODE = -1, - SYCL_SINGLE_GPU_MODE = 0, - SYCL_MUL_GPU_MODE -}; - static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); static void crash() { @@ -288,8 +282,8 @@ void* ggml_sycl_host_malloc(size_t size); void ggml_sycl_host_free(void* ptr); void ggml_backend_sycl_print_sycl_devices(); -static ggml_sycl_device_info ggml_sycl_init(); -ggml_sycl_device_info 
&ggml_sycl_info(); +static ggml_sycl_device_info ggml_sycl_init(int main_gpu_id); +ggml_sycl_device_info &ggml_sycl_info(int main_gpu_id = -1); // common device functions diff --git a/ggml/src/ggml-sycl/sycl_device.cpp b/ggml/src/ggml-sycl/sycl_device.cpp index e2633d729..fd8c04599 100644 --- a/ggml/src/ggml-sycl/sycl_device.cpp +++ b/ggml/src/ggml-sycl/sycl_device.cpp @@ -7,22 +7,54 @@ void ggml_sycl_device_info::init( switch (device_filter) { case SYCL_DEVICES_TOP_LEVEL_ZERO: detect_sycl_gpu_list_with_max_cu(); - create_context_for_devices(); break; case SYCL_ALL_DEVICES: detect_all_sycl_device_list(); - create_context_for_devices(); break; case SYCL_VISIBLE_DEVICES: detect_sycl_visible_device_list(); - create_context_for_devices(); break; default: std::cerr << "ggml_sycl_device_info: Invalid device_filter " << device_filter << std::endl; } - init_allow_devices(); + init_devices_dynamic_info(); + m_device_filter = device_filter; +} + +void ggml_sycl_device_info::clear_device_infos() { + ids.clear(); + devices.clear(); + + for (int id=0;id default_tensor_split = {}; - ggml_sycl_device_info(); - void init(ggml_sycl_backend_device_filter device_filter); + ggml_sycl_device_info(int main_gpu_id);//single device mode + void init(ggml_sycl_backend_device_filter device_filter); + void init_single_mode(int main_gpu_id); + + void clear_device_infos(); void print_gpu_device_list(); int work_group_size(int device_id); bool is_allowed_device(int device_id); @@ -64,15 +76,19 @@ struct ggml_sycl_device_info { sycl::queue *create_queue_for_device_id(int device_id); int get_device_index(int device_id); void create_context_for_devices(); - void init_allow_devices(); + void set_allow_devices(); void detect_all_sycl_device_list(); void detect_sycl_visible_device_list(); void detect_sycl_gpu_list_with_max_cu(); int get_device_count(); bool is_ext_oneapi_device(const sycl::device &dev); void add_device_info(int id); + void create_queues(int id); + void create_queues_for_devices(); 
std::vector get_devices(); std::vector get_sycl_visible_devices(); + void update_mem(); + void init_devices_dynamic_info(); sycl::context &get_co_ctx() { return co_ctx; } diff --git a/src/llama.cpp b/src/llama.cpp index 7bb2dfd46..c2a914add 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2831,8 +2831,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_ #elif defined(GGML_USE_VULKAN) buft = ggml_backend_vk_buffer_type(gpu); #elif defined(GGML_USE_SYCL) - int gpu_id = ggml_backend_sycl_get_device_id(gpu); - buft = ggml_backend_sycl_buffer_type(gpu_id); + buft = ggml_backend_sycl_buffer_type(gpu); #elif defined(GGML_USE_KOMPUTE) buft = ggml_backend_kompute_buffer_type(gpu); if (buft == nullptr) { @@ -5931,6 +5930,10 @@ static bool llm_load_tensors( model.buft_output = llama_default_buffer_type_cpu(true); } } else { + +#if defined(GGML_USE_SYCL) + ggml_backend_sycl_set_single_device_mode(main_gpu); +#endif ggml_backend_buffer_type_t split_buft; if (split_mode == LLAMA_SPLIT_MODE_ROW) { split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);