From de2763118fd5b6ea89702cc9981349e0556b0c3d Mon Sep 17 00:00:00 2001 From: Jianyu Zhang Date: Wed, 19 Jun 2024 22:54:15 +0800 Subject: [PATCH] fix to support multiple GPUs, fix set single device, unify id/device_id/device_index --- ggml/include/ggml-sycl.h | 6 + ggml/src/ggml-sycl.cpp | 434 ++++++++++++++++++---------------- ggml/src/ggml-sycl/common.hpp | 233 ++++++++++++++++-- src/llama.cpp | 1 + 4 files changed, 453 insertions(+), 221 deletions(-) diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h index 43ab1519c..652de9520 100644 --- a/ggml/include/ggml-sycl.h +++ b/ggml/include/ggml-sycl.h @@ -34,6 +34,12 @@ GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *des GGML_API GGML_CALL int ggml_backend_sycl_get_device_count(); GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); +GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id); +GGML_API GGML_CALL void ggml_sycl_set_single_device(int main_gpu_id); + +// GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id); +// GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode(); + // SYCL doesn't support registering host memory, keep here for reference // GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); // GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer); diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 76bad57e2..68ebe8bf9 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -51,6 +51,90 @@ static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer); static inline int get_sycl_env(const char *env_name, int default_val); static inline int get_work_group_size(const sycl::device& device); +static bool g_sycl_loaded = false; + +bool ggml_sycl_loaded(void) { + return g_sycl_loaded; +} + +int get_sycl_env(const char *env_name, int default_val) { + char *user_device_string = getenv(env_name); + int user_number = default_val; + + unsigned n; + if (user_device_string != NULL && + sscanf(user_device_string, " %u", &n) == 1) { + user_number = (int)n; + } else { + user_number = default_val; + } + return user_number; +} + +static ggml_sycl_device_info ggml_sycl_init() try { + static bool initialized = false; + + if (!initialized) { + fprintf(stderr, "[SYCL] call ggml_init_sycl\n"); + + g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); + fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, + g_ggml_sycl_debug); + +#if defined(GGML_SYCL_F16) + fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__); +#else + fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__); +#endif + +#if defined(GGML_SYCL_FORCE_MMQ) + fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__); +#else + fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__); +#endif + +#if defined(SYCL_USE_XMX) + fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); +#else + fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); +#endif + + if (CHECK_TRY_ERROR(g_all_sycl_device_count = + dpct::dev_mgr::instance().device_count()) != + 0) { + initialized = true; + g_sycl_loaded = false; + return; + } + GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); + ggml_backend_sycl_print_sycl_devices(); + initialized = true; + g_sycl_loaded = true; + } + + static ggml_sycl_device_info info = {}; + info.refresh_device(SYCL_MUL_GPU_MODE); + + if (info.device_count == 0) { + fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": 
%s\n", + __func__); + return info; + } + GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES); + + return info; +} catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + + +ggml_sycl_device_info &ggml_sycl_info() { + static ggml_sycl_device_info info = ggml_sycl_init(); + return info; +} + void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { char *host_buf = (char *)malloc(size); @@ -2016,12 +2100,6 @@ static void im2col_sycl(const float *x, T *dst, int IW, int IH, } -static bool g_sycl_loaded = false; - -bool ggml_sycl_loaded(void) { - return g_sycl_loaded; -} - void print_device_detail(int id, sycl::device &device, std::string device_type) { dpct::device_info prop; @@ -2121,67 +2199,20 @@ catch (sycl::exception const &exc) { std::exit(1); } -static ggml_sycl_device_info ggml_sycl_init() { - ggml_sycl_device_info info = {}; - - info.device_count = dpct::dev_mgr::instance().device_count(); - if (info.device_count == 0) { - fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__); - return info; - } - - GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES); - - int64_t total_vram = 0; -#if defined(GGML_SYCL_FORCE_MMQ) - fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__); -#else - fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__); -#endif -#if defined(SYCL_USE_XMX) - fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); -#else - fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); -#endif - fprintf(stderr, "%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count); - - for (int i = 0; i < info.device_count; ++i) { - info.devices[i].vmm = 0; - dpct::device_info prop; - SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(i)))); - - info.default_tensor_split[i] = total_vram; - total_vram += prop.get_global_mem_size(); - - info.devices[i].cc = - 100 * prop.get_major_version() + 10 * prop.get_minor_version(); - } - - for (int id = 0; id < info.device_count; ++id) { - info.default_tensor_split[id] /= total_vram; - } - return info; +GGML_API GGML_CALL void ggml_sycl_set_single_device(int device_id) { + ggml_sycl_info().refresh_device(SYCL_SINGLE_GPU_MODE, device_id); + ggml_sycl_set_main_device(device_id); } -const ggml_sycl_device_info & ggml_sycl_info() { - static ggml_sycl_device_info info = ggml_sycl_init(); - return info; -} - -/* -device_index: device index from 0 to n (continue numbers). - It is used for device select/set in SYCL backend internal data structure. 
-*/ -inline void check_allow_gpu_index(const int device_index) { - if (device_index >= ggml_sycl_info().device_count) { +inline void check_allow_device_id(const int device_id) { + if (device_id >= ggml_sycl_info().device_count) { char error_buf[256]; snprintf( error_buf, sizeof(error_buf), - "%s error: device_index:%d is out of range: [0-%d]", + "%s error: device_id:%d is out of range: [0-%d]", __func__, - device_index, + device_id, ggml_sycl_info().device_count - 1); fprintf(stderr, "%s\n", error_buf); assert(false); @@ -2192,7 +2223,7 @@ inline void check_allow_gpu_index(const int device_index) { struct ggml_sycl_pool_leg : public ggml_sycl_pool { static const int MAX_SYCL_BUFFERS = 256; - int device; + int device_id; queue_ptr qptr; struct ggml_sycl_buffer { void * ptr = nullptr; @@ -2204,7 +2235,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : qptr(qptr_), - device(device_) { + device_id(device_) { } ~ggml_sycl_pool_leg() { @@ -2288,12 +2319,12 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { } }; -std::unique_ptr ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) { +std::unique_ptr ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device_id) { // TBD: NO VMM support - // if (ggml_sycl_info().devices[device].vmm) { - // return std::unique_ptr(new ggml_sycl_pool_vmm(device)); + // if (ggml_sycl_info().devices[device_id].vmm) { + // return std::unique_ptr(new ggml_sycl_pool_vmm(device_id)); // } - return std::unique_ptr(new ggml_sycl_pool_leg(qptr, device)); + return std::unique_ptr(new ggml_sycl_pool_leg(qptr, device_id)); } // TBD pool with virtual memory management @@ -2704,13 +2735,13 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor static int64_t get_row_rounding(ggml_type type, const std::array & tensor_split) { int64_t min_compute_capability = INT_MAX; int64_t max_compute_capability = INT_MIN; - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) { - if (min_compute_capability > ggml_sycl_info().devices[i].cc) { - min_compute_capability = ggml_sycl_info().devices[i].cc; + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + if (tensor_split[id] < (id + 1 < ggml_sycl_info().device_count ? 
tensor_split[id + 1] : 1.0f)) { + if (min_compute_capability > ggml_sycl_info().devices[id].cc) { + min_compute_capability = ggml_sycl_info().devices[id].cc; } - if (max_compute_capability < ggml_sycl_info().devices[i].cc) { - max_compute_capability = ggml_sycl_info().devices[i].cc; + if (max_compute_capability < ggml_sycl_info().devices[id].cc) { + max_compute_capability = ggml_sycl_info().devices[id].cc; } } } @@ -3132,18 +3163,18 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) { } #ifdef NDEBUG - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - SYCL_CHECK(ggml_sycl_set_device(i)); + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + SYCL_CHECK(ggml_sycl_set_device(id)); } - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - SYCL_CHECK(ggml_sycl_set_device(i)); + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + SYCL_CHECK(ggml_sycl_set_device(id)); for (int id_other = 0; id_other < ggml_sycl_info().device_count; ++id_other) { - if (i == id_other) { + if (id == id_other) { continue; } - if (i != main_device && id_other != main_device) { + if (id != main_device && id_other != main_device) { continue; } @@ -3240,62 +3271,62 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten int used_devices = 0; queue_ptr main_stream = ctx.stream(); - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { // by default, use all rows - dev[i].row_low = 0; - dev[i].row_high = ne01; + dev[id].row_low = 0; + dev[id].row_high = ne01; // for multi GPU, get the row boundaries from tensor split // and round to mul_mat_q tile sizes if (split) { const int64_t rounding = get_row_rounding(src0->type, tensor_split); - if (i != 0) { - dev[i].row_low = ne01*tensor_split[i]; - if (dev[i].row_low < ne01) { - dev[i].row_low -= dev[i].row_low % rounding; + if (id != 0) { + dev[id].row_low = ne01*tensor_split[id]; + if (dev[id].row_low < ne01) { + dev[id].row_low -= dev[id].row_low % rounding; } } - if (i != ggml_sycl_info().device_count - 1) { - dev[i].row_high = ne01*tensor_split[i + 1]; - if (dev[i].row_high < ne01) { - dev[i].row_high -= dev[i].row_high % rounding; + if (id != ggml_sycl_info().device_count - 1) { + dev[id].row_high = ne01*tensor_split[id + 1]; + if (dev[id].row_high < ne01) { + dev[id].row_high -= dev[id].row_high % rounding; } } } } - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) { continue; } used_devices++; - const bool src1_on_device = i == ctx.device; - const bool dst_on_device = i == ctx.device; + const bool src1_on_device = id == ctx.device; + const bool dst_on_device = id == ctx.device; - ggml_sycl_set_device(i); - queue_ptr stream = ctx.stream(i, 0); + ggml_sycl_set_device(id); + queue_ptr stream = ctx.stream(id, 0); if (src0_is_contiguous) { - dev[i].src0_dd = (char *) src0->data; + dev[id].src0_dd = (char *) src0->data; } else { - dev[i].src0_dd = dev[i].src0_dd_alloc.alloc(ctx.pool(i), ggml_nbytes(src0)); + dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0)); } if (src1_on_device && src1_is_contiguous) { - dev[i].src1_ddf = (float *) src1->data; + dev[id].src1_ddf = (float *) src1->data; } else { - dev[i].src1_ddf = dev[i].src1_ddf_alloc.alloc(ctx.pool(i), 
ggml_nelements(src1)); + dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1)); } if (convert_src1_to_q8_1) { - dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs); + dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs); if (src1_on_device && src1_is_contiguous) { - quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream); + quantize_row_q8_1_sycl(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream); /* DPCT1010:90: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to @@ -3306,10 +3337,10 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten } if (dst_on_device) { - dev[i].dst_dd = (float *) dst->data; + dev[id].dst_dd = (float *) dst->data; } else { - const size_t size_dst_ddf = split ? (dev[i].row_high - dev[i].row_low)*ne1 : ggml_nelements(dst); - dev[i].dst_dd = dev[i].dst_dd_alloc.alloc(ctx.pool(i), size_dst_ddf); + const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst); + dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf); } } @@ -3332,20 +3363,20 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_SYCL_MAX_STREAMS : 0; const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride; - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) { continue; } - const bool src1_on_device = i == ctx.device; - const bool dst_on_device = i == ctx.device; - const int64_t row_diff = dev[i].row_high - dev[i].row_low; + const bool src1_on_device = id == ctx.device; + const bool dst_on_device = id == ctx.device; + const int64_t row_diff = dev[id].row_high - dev[id].row_low; - ggml_sycl_set_device(i); - queue_ptr stream = ctx.stream(i, is); + ggml_sycl_set_device(id); + queue_ptr stream = ctx.stream(id, is); // wait for main GPU data if necessary - if (split && (i != ctx.device || is != 0)) { + if (split && (id != ctx.device || is != 0)) { /* DPCT1009:163: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a @@ -3362,20 +3393,20 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs; // for split tensors the data begins at i0 == i0_offset_low - char * src0_dd_i = dev[i].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs; - float * src1_ddf_i = dev[i].src1_ddf + (i0*ne11 + src1_col_0) * ne10; - char * src1_ddq_i = dev[i].src1_ddq + src1_ddq_i_offset; - float * dst_dd_i = dev[i].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff); + char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs; + float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10; + char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset; + float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? 
ne0 : row_diff); // the main device memory buffer can be on VRAM scratch, with space for all partial results // in that case an offset on dst_ddf_i is needed - if (i == ctx.device) { - dst_dd_i += dev[i].row_low; // offset is 0 if no tensor split + if (id == ctx.device) { + dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split } // copy src0, src1 to device if necessary if (src1_is_contiguous) { - if (i != ctx.device) { + if (id != ctx.device) { if (convert_src1_to_q8_1) { char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset; SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy( @@ -3410,14 +3441,14 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten } if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) { - SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[i].row_low, dev[i].row_high, stream)); + SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream)); } if (src1->type == GGML_TYPE_F16) { src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10; } // do the computation SYCL_CHECK(CHECK_TRY_ERROR(op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i, - dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream))); + dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream))); /* DPCT1010:93: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to @@ -3436,7 +3467,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results. float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); - dhf_dst_i += src1_col_0*ne0 + dev[i].row_low; + dhf_dst_i += src1_col_0*ne0 + dev[id].row_low; SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy( dhf_dst_i, ne0 * sizeof(float), dst_dd_i, @@ -3453,7 +3484,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten } // add event for the main device to wait on until other device is done - if (split && (i != ctx.device || is != 0)) { + if (split && (id != ctx.device || is != 0)) { /* DPCT1024:94: The original code returned the error code that was further consumed by the program logic. This original @@ -3461,7 +3492,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten program logic consuming the error code. */ SYCL_CHECK(CHECK_TRY_ERROR( - *src0_extra->events[i][is] = + *src0_extra->events[id][is] = stream->ext_oneapi_submit_barrier())); } } @@ -3474,14 +3505,14 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten is_max = is_max <= GGML_SYCL_MAX_STREAMS ? 
is_max : GGML_SYCL_MAX_STREAMS; ggml_sycl_set_device(ctx.device); - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - if (dev[i].row_low == dev[i].row_high) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + if (dev[id].row_low == dev[id].row_high) { continue; } for (int64_t is = 0; is < is_max; ++is) { SYCL_CHECK(CHECK_TRY_ERROR( ctx.stream()->ext_oneapi_submit_barrier( - {*src0_extra->events[i][is]}))); + {*src0_extra->events[id][is]}))); } } } @@ -4310,17 +4341,17 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); } -void ggml_sycl_set_main_device(const int main_device) try { - if (dpct::get_current_device_id() == main_device) return; - check_allow_gpu_index(main_device); - dpct::select_device(main_device); +void ggml_sycl_set_main_device(const int main_device_id) try { + if (dpct::get_current_device_id() == main_device_id) return; + check_allow_device_id(main_device_id); + dpct::select_device(main_device_id); if (g_ggml_sycl_debug) { dpct::device_info prop; SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(main_device)))); + prop, dpct::dev_mgr::instance().get_device(main_device_id)))); fprintf(stderr, "Using device %d (%s) as main device\n", - main_device, prop.get_name()); + main_device_id, prop.get_name()); } } catch (sycl::exception const &exc) { @@ -4475,9 +4506,9 @@ GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n"); for(int i=0;i=max_len) break; - id_list[i] = i; + for (int id=0;id< ggml_sycl_info().device_count;id++){ + if (id>=max_len) break; + id_list[id] = id; } return; } @@ -4501,12 +4532,12 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, +GGML_API GGML_CALL void ggml_sycl_get_device_description(int device_id, char *description, size_t description_size) try { GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n"); dpct::device_info prop; SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(device)))); + prop, dpct::dev_mgr::instance().get_device(device_id)))); snprintf(description, description_size, "%s", prop.get_name()); } catch (sycl::exception const &exc) { @@ -4515,10 +4546,10 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, +GGML_CALL void ggml_backend_sycl_get_device_memory(int device_id, size_t *free, size_t *total) try { GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n"); - ggml_sycl_set_device(device); + ggml_sycl_set_device(device_id); /* DPCT1009:218: SYCL uses exceptions to report errors and does not use the @@ -4531,7 +4562,7 @@ GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, You may need to adjust the code. 
*/ SYCL_CHECK(CHECK_TRY_ERROR( - dpct::dev_mgr::instance().get_device(device).get_memory_info(*free, *total))); + dpct::dev_mgr::instance().get_device(device_id).get_memory_info(*free, *total))); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ @@ -4553,9 +4584,9 @@ struct ggml_backend_sycl_buffer_context { queue_ptr stream; std::string name; - ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) : - device(device), dev_ptr(dev_ptr), stream(stream) { - check_allow_gpu_index(device); + ggml_backend_sycl_buffer_context(int device_id, void * dev_ptr, queue_ptr stream) : + device(device_id), dev_ptr(dev_ptr), stream(stream) { + check_allow_device_id(device); name = (GGML_SYCL_NAME + std::to_string(device)); } @@ -4831,58 +4862,58 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { /* .is_host = */ nullptr, }; -ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { +ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) { static std::mutex mutex; std::lock_guard lock(mutex); GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n"); - if (device>=ggml_sycl_info().device_count or device<0) { - printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", - device, ggml_sycl_info().device_count-1); - GGML_ASSERT(device=ggml_sycl_info().device_count or device_id<0) { + printf("ggml_backend_sycl_buffer_type error: device_id:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", + device_id, ggml_sycl_info().device_count-1); + GGML_ASSERT(device_iddevice; - if (device>=ggml_sycl_info().device_count or device<0) { - printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", - device, ggml_sycl_info().device_count-1); - GGML_ASSERT(devicedevice; + if (device_id>=ggml_sycl_info().device_count or device_id<0) { + printf("ggml_backend_sycl_buffer_type error: device_id:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", + device_id, ggml_sycl_info().device_count-1); + GGML_ASSERT(device_idstream(i, 0)}, + /* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), ctx->stream(id, 0)}, }; } ggml_backend_sycl_buffer_type_initialized = true; } - return &ggml_backend_sycl_buffer_types[device]; + return &ggml_backend_sycl_buffer_types[device_id]; } // sycl split buffer type @@ -4903,9 +4934,9 @@ static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tens struct ggml_backend_sycl_split_buffer_context { ~ggml_backend_sycl_split_buffer_context() try { for (ggml_tensor_extra_gpu * extra : tensor_extras) { - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) { - if (extra->events[i][is] != nullptr) { + if (extra->events[id][is] != nullptr) { /* DPCT1009:206: SYCL uses exceptions to report errors and does not use the error codes. The original code was @@ -4913,19 +4944,19 @@ struct ggml_backend_sycl_split_buffer_context { need to rewrite this code. 
*/ SYCL_CHECK(CHECK_TRY_ERROR( - dpct::destroy_event(extra->events[i][is]))); + dpct::destroy_event(extra->events[id][is]))); } } - if (extra->data_device[i] != nullptr) { + if (extra->data_device[id] != nullptr) { /* DPCT1009:207: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code. */ - ggml_sycl_set_device(i); + ggml_sycl_set_device(id); SYCL_CHECK(CHECK_TRY_ERROR(sycl::free( - extra->data_device[i], *(streams[i])))); + extra->data_device[id], *(streams[id])))); } } delete extra; @@ -4978,9 +5009,9 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ctx->tensor_extras.push_back(extra); ctx->streams.push_back(&(dpct::get_current_device().default_queue())); - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); int64_t nrows_split = row_high - row_low; if (nrows_split == 0) { @@ -4997,8 +5028,8 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, // FIXME: do not crash if cudaMalloc fails // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first - ggml_sycl_set_device(i); - const queue_ptr stream = ctx->streams[i]; + ggml_sycl_set_device(id); + const queue_ptr stream = ctx->streams[id]; char * buf; /* DPCT1009:208: SYCL uses exceptions to report errors and does not use the @@ -5021,7 +5052,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, .wait())); } - extra->data_device[i] = buf; + extra->data_device[id] = buf; for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) { /* @@ -5030,7 +5061,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, string was inserted. You need to rewrite this code. */ SYCL_CHECK( - CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event())); + CHECK_TRY_ERROR(extra->events[id][is] = new sycl::event())); } } tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT; @@ -5057,9 +5088,9 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, const size_t nb1 = tensor->nb[1]; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); int64_t nrows_split = row_high - row_low; if (nrows_split == 0) { @@ -5081,11 +5112,11 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code. 
*/ - ggml_sycl_set_device(i); - const queue_ptr stream = ctx->streams[i]; + ggml_sycl_set_device(id); + const queue_ptr stream = ctx->streams[id]; SYCL_CHECK(CHECK_TRY_ERROR( (*stream) - .memcpy(extra->data_device[i], buf_host, original_size) + .memcpy(extra->data_device[id], buf_host, original_size) .wait())); } } @@ -5110,9 +5141,9 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const size_t nb1 = tensor->nb[1]; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); int64_t nrows_split = row_high - row_low; if (nrows_split == 0) { @@ -5134,11 +5165,11 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code. */ - ggml_sycl_set_device(i); - const queue_ptr stream = ctx->streams[i]; + ggml_sycl_set_device(id); + const queue_ptr stream = ctx->streams[id]; SYCL_CHECK(CHECK_TRY_ERROR( (*stream) - .memcpy(buf_host, extra->data_device[i], original_size) + .memcpy(buf_host, extra->data_device[id], original_size) .wait())); } } @@ -5193,9 +5224,9 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_ const int64_t ne0 = tensor->ne[0]; - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, i); + get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id); int64_t nrows_split = row_high - row_low; if (nrows_split == 0) { @@ -5244,12 +5275,12 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const f tensor_split_arr = ggml_sycl_info().default_tensor_split; } else { float split_sum = 0.0f; - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - tensor_split_arr[i] = split_sum; - split_sum += tensor_split[i]; + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + tensor_split_arr[id] = split_sum; + split_sum += tensor_split[id]; } - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - tensor_split_arr[i] /= split_sum; + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + tensor_split_arr[id] /= split_sum; } } @@ -5613,13 +5644,10 @@ static ggml_guid_t ggml_backend_sycl_guid() { return &guid; } -GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) { - GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n"); - ggml_check_sycl(); - - check_allow_gpu_index(device); - - ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(device); +GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device_id) { + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init, device_id=%d\n", device_id); + check_allow_device_id(device_id); + ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(ggml_sycl_info(), device_id); if (ctx == nullptr) { fprintf(stderr, "%s: error: failed to allocate context\n", __func__); return nullptr; @@ -5654,10 +5682,10 @@ extern "C" int ggml_backend_sycl_reg_devices(); int ggml_backend_sycl_reg_devices() { assert(ggml_sycl_info().device_count>0); - for (int i = 0; i < ggml_sycl_info().device_count; i++) { + for (int id = 0; id < ggml_sycl_info().device_count; 
id++) { char name[128]; - snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, i); - ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(i), (void *) (intptr_t) i); + snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, id); + ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(id), (void *) (intptr_t) id); } return ggml_sycl_info().device_count; } diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index dfd4a7c2c..518b4e4c1 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -49,7 +49,7 @@ static int g_ggml_sycl_debug = 0; // #define DEBUG_SYCL_MALLOC -static int g_work_group_size = 0; +static int g_work_group_size = -1; // typedef sycl::half ggml_fp16_t; #define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP @@ -89,6 +89,11 @@ enum ggml_sycl_backend_gpu_mode { SYCL_MUL_GPU_MODE }; +enum ggml_sycl_backend_device_filter { + SYCL_DEVICE_FILTER_ALL = 0, + SYCL_DEVICE_FILTER_TOP_LEVEL_ZERO +}; + static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); static void crash() { @@ -158,18 +163,18 @@ static size_t g_scratch_offset = 0; int get_current_device_id(); -inline dpct::err0 ggml_sycl_set_device(const int device) try { +inline dpct::err0 ggml_sycl_set_device(const int device_id) try { int current_device_id; SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id())); - // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, - // current_device_id=%d\n", device, current_device); - if (device == current_device_id) { + GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, current_device_id=%d\n", device_id, current_device_id); + if (device_id == current_device_id) { return 0; } - return CHECK_TRY_ERROR(dpct::select_device(device)); + return CHECK_TRY_ERROR(dpct::select_device(device_id)); + } catch (sycl::exception const& exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; @@ -177,25 +182,218 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try { std::exit(1); } -////////////////////// +class sycl_device_mgr { + public: + std::vector device_ids; + std::vector devices; + std::vector max_compute_units; + std::vector work_group_sizes; + sycl::queue *first_queue; + std::vector queues; + std::vector ctxs; + std::string device_list = ""; + + sycl_device_mgr(ggml_sycl_backend_device_filter device_filter) { + if (device_filter == SYCL_DEVICE_FILTER_TOP_LEVEL_ZERO) { + detect_sycl_gpu_list_with_max_cu(); + create_context_for_group_gpus(); + } else { + detect_all_sycl_device_list(); + create_context_queue_for_devices(); + } + get_allow_devices(); + } + + /* + Bind all gpus in same host with same context, for better performance in + device-to-device copy in the future. 
+ */ + void create_context_for_group_gpus() { + sycl::context ctx = sycl::context(devices); + assert(device_ids.size() > 0); + first_queue = dpct::get_current_device().create_queue(ctx, devices[0]); + sycl::context ctx0 = first_queue->get_context(); + for (int i = 0; i < device_ids.size(); i++) { + ctxs.push_back(ctx0); + dpct::select_device(device_ids[i]); + queues.push_back( + dpct::get_current_device().create_queue(ctx0, devices[i])); + } + } + + void create_context_queue_for_devices() { + for (int i = 0; i < device_ids.size(); i++) { + sycl::context ctx = sycl::context(devices[i]); + ctxs.push_back(ctx); + dpct::select_device(device_ids[i]); + queues.push_back( + dpct::get_current_device().create_queue(ctx, devices[i])); + } + } + + void get_allow_devices() { + device_list = ""; + for (size_t i = 0; i < device_ids.size(); ++i) { + device_list += std::to_string(device_ids[i]); + device_list += ","; + } + if (device_list.length() > 1) { + device_list.pop_back(); + } + } + + bool is_allowed_device(int device_id) { + return std::find(device_ids.begin(), device_ids.end(), device_id) != device_ids.end(); + } + + void detect_all_sycl_device_list() try { + int device_count = dpct::dev_mgr::instance().device_count(); + + for (int id = 0; id < device_count; id++) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + device_ids.push_back(id); + devices.push_back(device); + dpct::device_info prop; + dpct::get_device_info(prop, device); + work_group_sizes.push_back(prop.get_max_work_group_size()); + max_compute_units.push_back(prop.get_max_compute_units()); + } + return; + } catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); + } + + /* + Use all GPUs with same top max compute units + */ + void detect_sycl_gpu_list_with_max_cu() try { + int device_count = dpct::dev_mgr::instance().device_count(); + int local_max_compute_units = 0; + for (int id = 0; id < device_count; id++) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + if (!device.is_gpu()) + continue; + dpct::device_info prop; + dpct::get_device_info(prop, device); + if (local_max_compute_units < prop.get_max_compute_units()) + local_max_compute_units = prop.get_max_compute_units(); + } + + for (int id = 0; id < device_count; id++) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + if (!device.is_gpu()) + continue; + dpct::device_info prop; + dpct::get_device_info(prop, device); + if (local_max_compute_units == prop.get_max_compute_units() && + is_ext_oneapi_device(device)) { + device_ids.push_back(id); + devices.push_back(device); + work_group_sizes.push_back(prop.get_max_work_group_size()); + max_compute_units.push_back(prop.get_max_compute_units()); + } + } + return; + } catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); + } + + int get_device_count() { return (int)device_ids.size(); } + + bool is_ext_oneapi_device(const sycl::device &dev) { + sycl::backend dev_backend = dev.get_backend(); + if (dev_backend == sycl::backend::ext_oneapi_level_zero || + dev_backend == sycl::backend::ext_oneapi_cuda || + dev_backend == sycl::backend::ext_oneapi_hip) + return true; + return false; + } +}; struct ggml_sycl_device_info { int device_count; - + int main_gpu_id = -1; + ggml_sycl_backend_gpu_mode use_gpu_mode = SYCL_MUL_GPU_MODE; struct sycl_device_info { - int cc; // compute 
capability + int cc; // compute capability // int nsm; // number of streaming multiprocessors // size_t smpb; // max. shared memory per block - bool vmm; // virtual memory support - size_t total_vram; + bool vmm; // virtual memory support + size_t total_vram; }; sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {}; std::array default_tensor_split = {}; -}; -const ggml_sycl_device_info & ggml_sycl_info(); + sycl_device_mgr *local_sycl_device_mgr = NULL; + + void print_gpu_device_list() { + GGML_ASSERT(local_sycl_device_mgr); + + char *hint = NULL; + if (use_gpu_mode == SYCL_MUL_GPU_MODE) { + hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n"; + fprintf(stderr, hint, local_sycl_device_mgr->get_device_count(), + local_sycl_device_mgr->device_list.c_str(), + local_sycl_device_mgr->max_compute_units[main_gpu_id]); + } else { + hint = "use main device [%d] with Max compute units:%d\n"; + fprintf(stderr, hint, main_gpu_id, + local_sycl_device_mgr->max_compute_units[main_gpu_id]); + } + } + + int work_group_size(int device_id) { + GGML_ASSERT(local_sycl_device_mgr); + return local_sycl_device_mgr->work_group_sizes[device_id]; + } + + void refresh_device(ggml_sycl_backend_gpu_mode gpu_model, + int p_main_gpu_id = 0) { + main_gpu_id = p_main_gpu_id; + use_gpu_mode = gpu_model; + if (!local_sycl_device_mgr) + delete local_sycl_device_mgr; + + if (use_gpu_mode == SYCL_MUL_GPU_MODE) { + local_sycl_device_mgr = + new sycl_device_mgr(SYCL_DEVICE_FILTER_TOP_LEVEL_ZERO); + } else { + GGML_ASSERT(main_gpu_id >= 0); + local_sycl_device_mgr = new sycl_device_mgr(SYCL_DEVICE_FILTER_ALL); + } + + device_count = local_sycl_device_mgr->get_device_count(); + + int64_t total_vram = 0; + + for (int i = 0; i < device_count; ++i) { + devices[i].vmm = 0; + dpct::device_info prop; + SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( + prop, dpct::dev_mgr::instance().get_device(i)))); + + default_tensor_split[i] = total_vram; + total_vram += prop.get_global_mem_size(); + + devices[i].cc = + 100 * prop.get_major_version() + 10 * prop.get_minor_version(); + } + + for (int id = 0; id < device_count; ++id) { + default_tensor_split[id] /= total_vram; + } + + g_work_group_size = work_group_size(main_gpu_id); + print_gpu_device_list(); + } + +}; struct ggml_sycl_pool { virtual ~ggml_sycl_pool() = default; @@ -262,16 +460,15 @@ struct ggml_backend_sycl_context { queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } }; - explicit ggml_backend_sycl_context(int device) : + explicit ggml_backend_sycl_context(struct ggml_sycl_device_info &sycl_device_info, int device) : device(device), name(GGML_SYCL_NAME + std::to_string(device)) { + qptrs[device][0] = sycl_device_info.local_sycl_device_mgr->queues[device]; } queue_ptr stream(int device, int stream) { - if (qptrs[device][stream] == nullptr) { - qptrs[device][stream] = &(dpct::get_current_device().default_queue()); - } - return qptrs[device][stream]; + assert(qptrs[device][0] != nullptr); + return qptrs[device][0]; } queue_ptr stream() { diff --git a/src/llama.cpp b/src/llama.cpp index eea532f6a..98a386e72 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -17619,6 +17619,7 @@ struct llama_context * llama_new_context_with_model( #elif defined(GGML_USE_SYCL) // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { + ggml_sycl_set_single_device(model->main_gpu); ggml_backend_t backend = 
ggml_backend_sycl_init(model->main_gpu); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
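
For readers applying this patch, a minimal usage sketch (not part of the diff) of the new single-device entry point, modelled on the llama.cpp hunk above. The helper name init_sycl_backend and its boolean flag are assumptions for illustration only:

// Sketch: select single-GPU mode before creating a SYCL backend, as llama.cpp
// now does for LLAMA_SPLIT_MODE_NONE / LLAMA_SPLIT_MODE_ROW.
#include "ggml-backend.h"
#include "ggml-sycl.h"

static ggml_backend_t init_sycl_backend(int main_gpu_id, bool single_device) {
    if (single_device) {
        // re-filters the device list (SYCL_SINGLE_GPU_MODE in refresh_device())
        // and makes main_gpu_id the main device
        ggml_sycl_set_single_device(main_gpu_id);
    }
    // main_gpu_id must be a valid device id; check_allow_device_id() asserts otherwise
    return ggml_backend_sycl_init(main_gpu_id);
}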
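
A short sketch of enumerating the devices the backend reports after this change, using only the public functions declared in ggml-sycl.h above (buffer sizes are arbitrary):

// Sketch: list SYCL devices with their names and memory via the public API.
#include <cstdio>
#include "ggml-sycl.h"

static void list_sycl_devices() {
    const int count = ggml_backend_sycl_get_device_count();
    for (int id = 0; id < count; ++id) {
        char desc[128];
        size_t free_mem = 0, total_mem = 0;
        ggml_sycl_get_device_description(id, desc, sizeof(desc));
        ggml_backend_sycl_get_device_memory(id, &free_mem, &total_mem);
        printf("SYCL device %d: %s, %zu/%zu bytes free\n",
               id, desc, free_mem, total_mem);
    }
}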
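
The default_tensor_split logic that refresh_device() now owns is a cumulative-prefix split: entry i stores the total VRAM of devices 0..i-1, later normalized by the grand total. A self-contained sketch with hypothetical VRAM sizes:

// Sketch: how refresh_device() derives default_tensor_split.
#include <array>
#include <cstdio>

int main() {
    const int device_count = 3;
    const double vram[device_count] = {16e9, 16e9, 8e9};  // hypothetical sizes
    std::array<double, device_count> split{};
    double total = 0.0;
    for (int i = 0; i < device_count; ++i) {
        split[i] = total;   // prefix sum before adding this device
        total += vram[i];
    }
    for (int i = 0; i < device_count; ++i) {
        split[i] /= total;  // 0.00, 0.40, 0.80 for the sizes above
    }
    for (int i = 0; i < device_count; ++i) {
        printf("device %d starts at fraction %.2f\n", i, split[i]);
    }
}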
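
Similarly, the per-device row ranges in ggml_sycl_op_mul_mat (now indexed by id) follow the cumulative tensor_split and are rounded down to the mul_mat_q tile size. A simplified standalone sketch with hypothetical values; the guards for row_low/row_high reaching ne01 are omitted:

// Sketch: per-device [row_low, row_high) from tensor_split, tile-aligned.
#include <cstdint>
#include <cstdio>

int main() {
    const int device_count = 2;
    const float tensor_split[device_count] = {0.0f, 0.6f};  // hypothetical split
    const int64_t ne01 = 1000;    // rows in src0
    const int64_t rounding = 64;  // tile size, as from get_row_rounding()

    for (int id = 0; id < device_count; ++id) {
        int64_t row_low  = 0;
        int64_t row_high = ne01;
        if (id != 0) {
            row_low = (int64_t)(ne01 * tensor_split[id]);
            row_low -= row_low % rounding;   // align to tile size
        }
        if (id != device_count - 1) {
            row_high = (int64_t)(ne01 * tensor_split[id + 1]);
            row_high -= row_high % rounding;
        }
        printf("device %d: rows [%lld, %lld)\n",
               id, (long long)row_low, (long long)row_high);
    }
}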