From de2763118fd5b6ea89702cc9981349e0556b0c3d Mon Sep 17 00:00:00 2001 From: Jianyu Zhang Date: Wed, 19 Jun 2024 22:54:15 +0800 Subject: [PATCH] fix to support multiple GPUs, fix set single device, unify id/device_id/device_index --- ggml/include/ggml-sycl.h | 6 + ggml/src/ggml-sycl.cpp | 434 ++++++++++++++++++---------------- ggml/src/ggml-sycl/common.hpp | 233 ++++++++++++++++-- src/llama.cpp | 1 + 4 files changed, 453 insertions(+), 221 deletions(-) diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h index 43ab1519c..652de9520 100644 --- a/ggml/include/ggml-sycl.h +++ b/ggml/include/ggml-sycl.h @@ -34,6 +34,12 @@ GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *des GGML_API GGML_CALL int ggml_backend_sycl_get_device_count(); GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); +GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id); +GGML_API GGML_CALL void ggml_sycl_set_single_device(int main_gpu_id); + +// GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id); +// GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode(); + // SYCL doesn't support registering host memory, keep here for reference // GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); // GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer); diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 76bad57e2..68ebe8bf9 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -51,6 +51,90 @@ static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer); static inline int get_sycl_env(const char *env_name, int default_val); static inline int get_work_group_size(const sycl::device& device); +static bool g_sycl_loaded = false; + +bool ggml_sycl_loaded(void) { + return g_sycl_loaded; +} + +int get_sycl_env(const char *env_name, int default_val) { + char *user_device_string = getenv(env_name); + int user_number = default_val; + + unsigned n; + if (user_device_string != NULL && + sscanf(user_device_string, " %u", &n) == 1) { + user_number = (int)n; + } else { + user_number = default_val; + } + return user_number; +} + +static ggml_sycl_device_info ggml_sycl_init() try { + static bool initialized = false; + + if (!initialized) { + fprintf(stderr, "[SYCL] call ggml_init_sycl\n"); + + g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); + fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, + g_ggml_sycl_debug); + +#if defined(GGML_SYCL_F16) + fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__); +#else + fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__); +#endif + +#if defined(GGML_SYCL_FORCE_MMQ) + fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__); +#else + fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__); +#endif + +#if defined(SYCL_USE_XMX) + fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); +#else + fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); +#endif + + if (CHECK_TRY_ERROR(g_all_sycl_device_count = + dpct::dev_mgr::instance().device_count()) != + 0) { + initialized = true; + g_sycl_loaded = false; + return; + } + GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); + ggml_backend_sycl_print_sycl_devices(); + initialized = true; + g_sycl_loaded = true; + } + + static ggml_sycl_device_info info = {}; + info.refresh_device(SYCL_MUL_GPU_MODE); + + if (info.device_count == 0) { + fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": 
%s\n", + __func__); + return info; + } + GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES); + + return info; +} catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + + +ggml_sycl_device_info &ggml_sycl_info() { + static ggml_sycl_device_info info = ggml_sycl_init(); + return info; +} + void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { char *host_buf = (char *)malloc(size); @@ -2016,12 +2100,6 @@ static void im2col_sycl(const float *x, T *dst, int IW, int IH, } -static bool g_sycl_loaded = false; - -bool ggml_sycl_loaded(void) { - return g_sycl_loaded; -} - void print_device_detail(int id, sycl::device &device, std::string device_type) { dpct::device_info prop; @@ -2121,67 +2199,20 @@ catch (sycl::exception const &exc) { std::exit(1); } -static ggml_sycl_device_info ggml_sycl_init() { - ggml_sycl_device_info info = {}; - - info.device_count = dpct::dev_mgr::instance().device_count(); - if (info.device_count == 0) { - fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__); - return info; - } - - GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES); - - int64_t total_vram = 0; -#if defined(GGML_SYCL_FORCE_MMQ) - fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__); -#else - fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__); -#endif -#if defined(SYCL_USE_XMX) - fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); -#else - fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); -#endif - fprintf(stderr, "%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count); - - for (int i = 0; i < info.device_count; ++i) { - info.devices[i].vmm = 0; - dpct::device_info prop; - SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(i)))); - - info.default_tensor_split[i] = total_vram; - total_vram += prop.get_global_mem_size(); - - info.devices[i].cc = - 100 * prop.get_major_version() + 10 * prop.get_minor_version(); - } - - for (int id = 0; id < info.device_count; ++id) { - info.default_tensor_split[id] /= total_vram; - } - return info; +GGML_API GGML_CALL void ggml_sycl_set_single_device(int device_id) { + ggml_sycl_info().refresh_device(SYCL_SINGLE_GPU_MODE, device_id); + ggml_sycl_set_main_device(device_id); } -const ggml_sycl_device_info & ggml_sycl_info() { - static ggml_sycl_device_info info = ggml_sycl_init(); - return info; -} - -/* -device_index: device index from 0 to n (continue numbers). - It is used for device select/set in SYCL backend internal data structure. 
-*/ -inline void check_allow_gpu_index(const int device_index) { - if (device_index >= ggml_sycl_info().device_count) { +inline void check_allow_device_id(const int device_id) { + if (device_id >= ggml_sycl_info().device_count) { char error_buf[256]; snprintf( error_buf, sizeof(error_buf), - "%s error: device_index:%d is out of range: [0-%d]", + "%s error: device_id:%d is out of range: [0-%d]", __func__, - device_index, + device_id, ggml_sycl_info().device_count - 1); fprintf(stderr, "%s\n", error_buf); assert(false); @@ -2192,7 +2223,7 @@ inline void check_allow_gpu_index(const int device_index) { struct ggml_sycl_pool_leg : public ggml_sycl_pool { static const int MAX_SYCL_BUFFERS = 256; - int device; + int device_id; queue_ptr qptr; struct ggml_sycl_buffer { void * ptr = nullptr; @@ -2204,7 +2235,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : qptr(qptr_), - device(device_) { + device_id(device_) { } ~ggml_sycl_pool_leg() { @@ -2288,12 +2319,12 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { } }; -std::unique_ptr ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) { +std::unique_ptr ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device_id) { // TBD: NO VMM support - // if (ggml_sycl_info().devices[device].vmm) { - // return std::unique_ptr(new ggml_sycl_pool_vmm(device)); + // if (ggml_sycl_info().devices[device_id].vmm) { + // return std::unique_ptr(new ggml_sycl_pool_vmm(device_id)); // } - return std::unique_ptr(new ggml_sycl_pool_leg(qptr, device)); + return std::unique_ptr(new ggml_sycl_pool_leg(qptr, device_id)); } // TBD pool with virtual memory management @@ -2704,13 +2735,13 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor static int64_t get_row_rounding(ggml_type type, const std::array & tensor_split) { int64_t min_compute_capability = INT_MAX; int64_t max_compute_capability = INT_MIN; - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) { - if (min_compute_capability > ggml_sycl_info().devices[i].cc) { - min_compute_capability = ggml_sycl_info().devices[i].cc; + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + if (tensor_split[id] < (id + 1 < ggml_sycl_info().device_count ? 
tensor_split[id + 1] : 1.0f)) { + if (min_compute_capability > ggml_sycl_info().devices[id].cc) { + min_compute_capability = ggml_sycl_info().devices[id].cc; } - if (max_compute_capability < ggml_sycl_info().devices[i].cc) { - max_compute_capability = ggml_sycl_info().devices[i].cc; + if (max_compute_capability < ggml_sycl_info().devices[id].cc) { + max_compute_capability = ggml_sycl_info().devices[id].cc; } } } @@ -3132,18 +3163,18 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) { } #ifdef NDEBUG - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - SYCL_CHECK(ggml_sycl_set_device(i)); + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + SYCL_CHECK(ggml_sycl_set_device(id)); } - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - SYCL_CHECK(ggml_sycl_set_device(i)); + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + SYCL_CHECK(ggml_sycl_set_device(id)); for (int id_other = 0; id_other < ggml_sycl_info().device_count; ++id_other) { - if (i == id_other) { + if (id == id_other) { continue; } - if (i != main_device && id_other != main_device) { + if (id != main_device && id_other != main_device) { continue; } @@ -3240,62 +3271,62 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten int used_devices = 0; queue_ptr main_stream = ctx.stream(); - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { // by default, use all rows - dev[i].row_low = 0; - dev[i].row_high = ne01; + dev[id].row_low = 0; + dev[id].row_high = ne01; // for multi GPU, get the row boundaries from tensor split // and round to mul_mat_q tile sizes if (split) { const int64_t rounding = get_row_rounding(src0->type, tensor_split); - if (i != 0) { - dev[i].row_low = ne01*tensor_split[i]; - if (dev[i].row_low < ne01) { - dev[i].row_low -= dev[i].row_low % rounding; + if (id != 0) { + dev[id].row_low = ne01*tensor_split[id]; + if (dev[id].row_low < ne01) { + dev[id].row_low -= dev[id].row_low % rounding; } } - if (i != ggml_sycl_info().device_count - 1) { - dev[i].row_high = ne01*tensor_split[i + 1]; - if (dev[i].row_high < ne01) { - dev[i].row_high -= dev[i].row_high % rounding; + if (id != ggml_sycl_info().device_count - 1) { + dev[id].row_high = ne01*tensor_split[id + 1]; + if (dev[id].row_high < ne01) { + dev[id].row_high -= dev[id].row_high % rounding; } } } } - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) { continue; } used_devices++; - const bool src1_on_device = i == ctx.device; - const bool dst_on_device = i == ctx.device; + const bool src1_on_device = id == ctx.device; + const bool dst_on_device = id == ctx.device; - ggml_sycl_set_device(i); - queue_ptr stream = ctx.stream(i, 0); + ggml_sycl_set_device(id); + queue_ptr stream = ctx.stream(id, 0); if (src0_is_contiguous) { - dev[i].src0_dd = (char *) src0->data; + dev[id].src0_dd = (char *) src0->data; } else { - dev[i].src0_dd = dev[i].src0_dd_alloc.alloc(ctx.pool(i), ggml_nbytes(src0)); + dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0)); } if (src1_on_device && src1_is_contiguous) { - dev[i].src1_ddf = (float *) src1->data; + dev[id].src1_ddf = (float *) src1->data; } else { - dev[i].src1_ddf = dev[i].src1_ddf_alloc.alloc(ctx.pool(i), 
ggml_nelements(src1)); + dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1)); } if (convert_src1_to_q8_1) { - dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs); + dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs); if (src1_on_device && src1_is_contiguous) { - quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream); + quantize_row_q8_1_sycl(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream); /* DPCT1010:90: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to @@ -3306,10 +3337,10 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten } if (dst_on_device) { - dev[i].dst_dd = (float *) dst->data; + dev[id].dst_dd = (float *) dst->data; } else { - const size_t size_dst_ddf = split ? (dev[i].row_high - dev[i].row_low)*ne1 : ggml_nelements(dst); - dev[i].dst_dd = dev[i].dst_dd_alloc.alloc(ctx.pool(i), size_dst_ddf); + const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst); + dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf); } } @@ -3332,20 +3363,20 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_SYCL_MAX_STREAMS : 0; const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride; - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) { continue; } - const bool src1_on_device = i == ctx.device; - const bool dst_on_device = i == ctx.device; - const int64_t row_diff = dev[i].row_high - dev[i].row_low; + const bool src1_on_device = id == ctx.device; + const bool dst_on_device = id == ctx.device; + const int64_t row_diff = dev[id].row_high - dev[id].row_low; - ggml_sycl_set_device(i); - queue_ptr stream = ctx.stream(i, is); + ggml_sycl_set_device(id); + queue_ptr stream = ctx.stream(id, is); // wait for main GPU data if necessary - if (split && (i != ctx.device || is != 0)) { + if (split && (id != ctx.device || is != 0)) { /* DPCT1009:163: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a @@ -3362,20 +3393,20 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs; // for split tensors the data begins at i0 == i0_offset_low - char * src0_dd_i = dev[i].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs; - float * src1_ddf_i = dev[i].src1_ddf + (i0*ne11 + src1_col_0) * ne10; - char * src1_ddq_i = dev[i].src1_ddq + src1_ddq_i_offset; - float * dst_dd_i = dev[i].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff); + char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs; + float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10; + char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset; + float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? 
ne0 : row_diff); // the main device memory buffer can be on VRAM scratch, with space for all partial results // in that case an offset on dst_ddf_i is needed - if (i == ctx.device) { - dst_dd_i += dev[i].row_low; // offset is 0 if no tensor split + if (id == ctx.device) { + dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split } // copy src0, src1 to device if necessary if (src1_is_contiguous) { - if (i != ctx.device) { + if (id != ctx.device) { if (convert_src1_to_q8_1) { char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset; SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy( @@ -3410,14 +3441,14 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten } if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) { - SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[i].row_low, dev[i].row_high, stream)); + SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream)); } if (src1->type == GGML_TYPE_F16) { src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10; } // do the computation SYCL_CHECK(CHECK_TRY_ERROR(op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i, - dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream))); + dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream))); /* DPCT1010:93: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to @@ -3436,7 +3467,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results. float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); - dhf_dst_i += src1_col_0*ne0 + dev[i].row_low; + dhf_dst_i += src1_col_0*ne0 + dev[id].row_low; SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy( dhf_dst_i, ne0 * sizeof(float), dst_dd_i, @@ -3453,7 +3484,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten } // add event for the main device to wait on until other device is done - if (split && (i != ctx.device || is != 0)) { + if (split && (id != ctx.device || is != 0)) { /* DPCT1024:94: The original code returned the error code that was further consumed by the program logic. This original @@ -3461,7 +3492,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten program logic consuming the error code. */ SYCL_CHECK(CHECK_TRY_ERROR( - *src0_extra->events[i][is] = + *src0_extra->events[id][is] = stream->ext_oneapi_submit_barrier())); } } @@ -3474,14 +3505,14 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten is_max = is_max <= GGML_SYCL_MAX_STREAMS ? 
is_max : GGML_SYCL_MAX_STREAMS; ggml_sycl_set_device(ctx.device); - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - if (dev[i].row_low == dev[i].row_high) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + if (dev[id].row_low == dev[id].row_high) { continue; } for (int64_t is = 0; is < is_max; ++is) { SYCL_CHECK(CHECK_TRY_ERROR( ctx.stream()->ext_oneapi_submit_barrier( - {*src0_extra->events[i][is]}))); + {*src0_extra->events[id][is]}))); } } } @@ -4310,17 +4341,17 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); } -void ggml_sycl_set_main_device(const int main_device) try { - if (dpct::get_current_device_id() == main_device) return; - check_allow_gpu_index(main_device); - dpct::select_device(main_device); +void ggml_sycl_set_main_device(const int main_device_id) try { + if (dpct::get_current_device_id() == main_device_id) return; + check_allow_device_id(main_device_id); + dpct::select_device(main_device_id); if (g_ggml_sycl_debug) { dpct::device_info prop; SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(main_device)))); + prop, dpct::dev_mgr::instance().get_device(main_device_id)))); fprintf(stderr, "Using device %d (%s) as main device\n", - main_device, prop.get_name()); + main_device_id, prop.get_name()); } } catch (sycl::exception const &exc) { @@ -4475,9 +4506,9 @@ GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n"); for(int i=0;i=max_len) break; - id_list[i] = i; + for (int id=0;id< ggml_sycl_info().device_count;id++){ + if (id>=max_len) break; + id_list[id] = id; } return; } @@ -4501,12 +4532,12 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, +GGML_API GGML_CALL void ggml_sycl_get_device_description(int device_id, char *description, size_t description_size) try { GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n"); dpct::device_info prop; SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(device)))); + prop, dpct::dev_mgr::instance().get_device(device_id)))); snprintf(description, description_size, "%s", prop.get_name()); } catch (sycl::exception const &exc) { @@ -4515,10 +4546,10 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, +GGML_CALL void ggml_backend_sycl_get_device_memory(int device_id, size_t *free, size_t *total) try { GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n"); - ggml_sycl_set_device(device); + ggml_sycl_set_device(device_id); /* DPCT1009:218: SYCL uses exceptions to report errors and does not use the @@ -4531,7 +4562,7 @@ GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, You may need to adjust the code. 
*/ SYCL_CHECK(CHECK_TRY_ERROR( - dpct::dev_mgr::instance().get_device(device).get_memory_info(*free, *total))); + dpct::dev_mgr::instance().get_device(device_id).get_memory_info(*free, *total))); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ @@ -4553,9 +4584,9 @@ struct ggml_backend_sycl_buffer_context { queue_ptr stream; std::string name; - ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) : - device(device), dev_ptr(dev_ptr), stream(stream) { - check_allow_gpu_index(device); + ggml_backend_sycl_buffer_context(int device_id, void * dev_ptr, queue_ptr stream) : + device(device_id), dev_ptr(dev_ptr), stream(stream) { + check_allow_device_id(device); name = (GGML_SYCL_NAME + std::to_string(device)); } @@ -4831,58 +4862,58 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { /* .is_host = */ nullptr, }; -ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { +ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) { static std::mutex mutex; std::lock_guard lock(mutex); GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n"); - if (device>=ggml_sycl_info().device_count or device<0) { - printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", - device, ggml_sycl_info().device_count-1); - GGML_ASSERT(device=ggml_sycl_info().device_count or device_id<0) { + printf("ggml_backend_sycl_buffer_type error: device_id:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", + device_id, ggml_sycl_info().device_count-1); + GGML_ASSERT(device_iddevice; - if (device>=ggml_sycl_info().device_count or device<0) { - printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", - device, ggml_sycl_info().device_count-1); - GGML_ASSERT(devicedevice; + if (device_id>=ggml_sycl_info().device_count or device_id<0) { + printf("ggml_backend_sycl_buffer_type error: device_id:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", + device_id, ggml_sycl_info().device_count-1); + GGML_ASSERT(device_idstream(i, 0)}, + /* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), ctx->stream(id, 0)}, }; } ggml_backend_sycl_buffer_type_initialized = true; } - return &ggml_backend_sycl_buffer_types[device]; + return &ggml_backend_sycl_buffer_types[device_id]; } // sycl split buffer type @@ -4903,9 +4934,9 @@ static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tens struct ggml_backend_sycl_split_buffer_context { ~ggml_backend_sycl_split_buffer_context() try { for (ggml_tensor_extra_gpu * extra : tensor_extras) { - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) { - if (extra->events[i][is] != nullptr) { + if (extra->events[id][is] != nullptr) { /* DPCT1009:206: SYCL uses exceptions to report errors and does not use the error codes. The original code was @@ -4913,19 +4944,19 @@ struct ggml_backend_sycl_split_buffer_context { need to rewrite this code. 
*/ SYCL_CHECK(CHECK_TRY_ERROR( - dpct::destroy_event(extra->events[i][is]))); + dpct::destroy_event(extra->events[id][is]))); } } - if (extra->data_device[i] != nullptr) { + if (extra->data_device[id] != nullptr) { /* DPCT1009:207: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code. */ - ggml_sycl_set_device(i); + ggml_sycl_set_device(id); SYCL_CHECK(CHECK_TRY_ERROR(sycl::free( - extra->data_device[i], *(streams[i])))); + extra->data_device[id], *(streams[id])))); } } delete extra; @@ -4978,9 +5009,9 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ctx->tensor_extras.push_back(extra); ctx->streams.push_back(&(dpct::get_current_device().default_queue())); - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); int64_t nrows_split = row_high - row_low; if (nrows_split == 0) { @@ -4997,8 +5028,8 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, // FIXME: do not crash if cudaMalloc fails // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first - ggml_sycl_set_device(i); - const queue_ptr stream = ctx->streams[i]; + ggml_sycl_set_device(id); + const queue_ptr stream = ctx->streams[id]; char * buf; /* DPCT1009:208: SYCL uses exceptions to report errors and does not use the @@ -5021,7 +5052,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, .wait())); } - extra->data_device[i] = buf; + extra->data_device[id] = buf; for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) { /* @@ -5030,7 +5061,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, string was inserted. You need to rewrite this code. */ SYCL_CHECK( - CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event())); + CHECK_TRY_ERROR(extra->events[id][is] = new sycl::event())); } } tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT; @@ -5057,9 +5088,9 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, const size_t nb1 = tensor->nb[1]; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); int64_t nrows_split = row_high - row_low; if (nrows_split == 0) { @@ -5081,11 +5112,11 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code. 
*/ - ggml_sycl_set_device(i); - const queue_ptr stream = ctx->streams[i]; + ggml_sycl_set_device(id); + const queue_ptr stream = ctx->streams[id]; SYCL_CHECK(CHECK_TRY_ERROR( (*stream) - .memcpy(extra->data_device[i], buf_host, original_size) + .memcpy(extra->data_device[id], buf_host, original_size) .wait())); } } @@ -5110,9 +5141,9 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const size_t nb1 = tensor->nb[1]; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); int64_t nrows_split = row_high - row_low; if (nrows_split == 0) { @@ -5134,11 +5165,11 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code. */ - ggml_sycl_set_device(i); - const queue_ptr stream = ctx->streams[i]; + ggml_sycl_set_device(id); + const queue_ptr stream = ctx->streams[id]; SYCL_CHECK(CHECK_TRY_ERROR( (*stream) - .memcpy(buf_host, extra->data_device[i], original_size) + .memcpy(buf_host, extra->data_device[id], original_size) .wait())); } } @@ -5193,9 +5224,9 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_ const int64_t ne0 = tensor->ne[0]; - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, i); + get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id); int64_t nrows_split = row_high - row_low; if (nrows_split == 0) { @@ -5244,12 +5275,12 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const f tensor_split_arr = ggml_sycl_info().default_tensor_split; } else { float split_sum = 0.0f; - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - tensor_split_arr[i] = split_sum; - split_sum += tensor_split[i]; + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + tensor_split_arr[id] = split_sum; + split_sum += tensor_split[id]; } - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - tensor_split_arr[i] /= split_sum; + for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + tensor_split_arr[id] /= split_sum; } } @@ -5613,13 +5644,10 @@ static ggml_guid_t ggml_backend_sycl_guid() { return &guid; } -GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) { - GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n"); - ggml_check_sycl(); - - check_allow_gpu_index(device); - - ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(device); +GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device_id) { + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init, device_id=%d\n", device_id); + check_allow_device_id(device_id); + ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(ggml_sycl_info(), device_id); if (ctx == nullptr) { fprintf(stderr, "%s: error: failed to allocate context\n", __func__); return nullptr; @@ -5654,10 +5682,10 @@ extern "C" int ggml_backend_sycl_reg_devices(); int ggml_backend_sycl_reg_devices() { assert(ggml_sycl_info().device_count>0); - for (int i = 0; i < ggml_sycl_info().device_count; i++) { + for (int id = 0; id < ggml_sycl_info().device_count; 
id++) { char name[128]; - snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, i); - ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(i), (void *) (intptr_t) i); + snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, id); + ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(id), (void *) (intptr_t) id); } return ggml_sycl_info().device_count; } diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index dfd4a7c2c..518b4e4c1 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -49,7 +49,7 @@ static int g_ggml_sycl_debug = 0; // #define DEBUG_SYCL_MALLOC -static int g_work_group_size = 0; +static int g_work_group_size = -1; // typedef sycl::half ggml_fp16_t; #define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP @@ -89,6 +89,11 @@ enum ggml_sycl_backend_gpu_mode { SYCL_MUL_GPU_MODE }; +enum ggml_sycl_backend_device_filter { + SYCL_DEVICE_FILTER_ALL = 0, + SYCL_DEVICE_FILTER_TOP_LEVEL_ZERO +}; + static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); static void crash() { @@ -158,18 +163,18 @@ static size_t g_scratch_offset = 0; int get_current_device_id(); -inline dpct::err0 ggml_sycl_set_device(const int device) try { +inline dpct::err0 ggml_sycl_set_device(const int device_id) try { int current_device_id; SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id())); - // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, - // current_device_id=%d\n", device, current_device); - if (device == current_device_id) { + GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, current_device_id=%d\n", device_id, current_device_id); + if (device_id == current_device_id) { return 0; } - return CHECK_TRY_ERROR(dpct::select_device(device)); + return CHECK_TRY_ERROR(dpct::select_device(device_id)); + } catch (sycl::exception const& exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; @@ -177,25 +182,218 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try { std::exit(1); } -////////////////////// +class sycl_device_mgr { + public: + std::vector device_ids; + std::vector devices; + std::vector max_compute_units; + std::vector work_group_sizes; + sycl::queue *first_queue; + std::vector queues; + std::vector ctxs; + std::string device_list = ""; + + sycl_device_mgr(ggml_sycl_backend_device_filter device_filter) { + if (device_filter == SYCL_DEVICE_FILTER_TOP_LEVEL_ZERO) { + detect_sycl_gpu_list_with_max_cu(); + create_context_for_group_gpus(); + } else { + detect_all_sycl_device_list(); + create_context_queue_for_devices(); + } + get_allow_devices(); + } + + /* + Bind all gpus in same host with same context, for better performance in + device-to-device copy in the future. 
+ */ + void create_context_for_group_gpus() { + sycl::context ctx = sycl::context(devices); + assert(device_ids.size() > 0); + first_queue = dpct::get_current_device().create_queue(ctx, devices[0]); + sycl::context ctx0 = first_queue->get_context(); + for (int i = 0; i < device_ids.size(); i++) { + ctxs.push_back(ctx0); + dpct::select_device(device_ids[i]); + queues.push_back( + dpct::get_current_device().create_queue(ctx0, devices[i])); + } + } + + void create_context_queue_for_devices() { + for (int i = 0; i < device_ids.size(); i++) { + sycl::context ctx = sycl::context(devices[i]); + ctxs.push_back(ctx); + dpct::select_device(device_ids[i]); + queues.push_back( + dpct::get_current_device().create_queue(ctx, devices[i])); + } + } + + void get_allow_devices() { + device_list = ""; + for (size_t i = 0; i < device_ids.size(); ++i) { + device_list += std::to_string(device_ids[i]); + device_list += ","; + } + if (device_list.length() > 1) { + device_list.pop_back(); + } + } + + bool is_allowed_device(int device_id) { + return std::find(device_ids.begin(), device_ids.end(), device_id) != device_ids.end(); + } + + void detect_all_sycl_device_list() try { + int device_count = dpct::dev_mgr::instance().device_count(); + + for (int id = 0; id < device_count; id++) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + device_ids.push_back(id); + devices.push_back(device); + dpct::device_info prop; + dpct::get_device_info(prop, device); + work_group_sizes.push_back(prop.get_max_work_group_size()); + max_compute_units.push_back(prop.get_max_compute_units()); + } + return; + } catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); + } + + /* + Use all GPUs with same top max compute units + */ + void detect_sycl_gpu_list_with_max_cu() try { + int device_count = dpct::dev_mgr::instance().device_count(); + int local_max_compute_units = 0; + for (int id = 0; id < device_count; id++) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + if (!device.is_gpu()) + continue; + dpct::device_info prop; + dpct::get_device_info(prop, device); + if (local_max_compute_units < prop.get_max_compute_units()) + local_max_compute_units = prop.get_max_compute_units(); + } + + for (int id = 0; id < device_count; id++) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + if (!device.is_gpu()) + continue; + dpct::device_info prop; + dpct::get_device_info(prop, device); + if (local_max_compute_units == prop.get_max_compute_units() && + is_ext_oneapi_device(device)) { + device_ids.push_back(id); + devices.push_back(device); + work_group_sizes.push_back(prop.get_max_work_group_size()); + max_compute_units.push_back(prop.get_max_compute_units()); + } + } + return; + } catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); + } + + int get_device_count() { return (int)device_ids.size(); } + + bool is_ext_oneapi_device(const sycl::device &dev) { + sycl::backend dev_backend = dev.get_backend(); + if (dev_backend == sycl::backend::ext_oneapi_level_zero || + dev_backend == sycl::backend::ext_oneapi_cuda || + dev_backend == sycl::backend::ext_oneapi_hip) + return true; + return false; + } +}; struct ggml_sycl_device_info { int device_count; - + int main_gpu_id = -1; + ggml_sycl_backend_gpu_mode use_gpu_mode = SYCL_MUL_GPU_MODE; struct sycl_device_info { - int cc; // compute 
capability + int cc; // compute capability // int nsm; // number of streaming multiprocessors // size_t smpb; // max. shared memory per block - bool vmm; // virtual memory support - size_t total_vram; + bool vmm; // virtual memory support + size_t total_vram; }; sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {}; std::array default_tensor_split = {}; -}; -const ggml_sycl_device_info & ggml_sycl_info(); + sycl_device_mgr *local_sycl_device_mgr = NULL; + + void print_gpu_device_list() { + GGML_ASSERT(local_sycl_device_mgr); + + char *hint = NULL; + if (use_gpu_mode == SYCL_MUL_GPU_MODE) { + hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n"; + fprintf(stderr, hint, local_sycl_device_mgr->get_device_count(), + local_sycl_device_mgr->device_list.c_str(), + local_sycl_device_mgr->max_compute_units[main_gpu_id]); + } else { + hint = "use main device [%d] with Max compute units:%d\n"; + fprintf(stderr, hint, main_gpu_id, + local_sycl_device_mgr->max_compute_units[main_gpu_id]); + } + } + + int work_group_size(int device_id) { + GGML_ASSERT(local_sycl_device_mgr); + return local_sycl_device_mgr->work_group_sizes[device_id]; + } + + void refresh_device(ggml_sycl_backend_gpu_mode gpu_model, + int p_main_gpu_id = 0) { + main_gpu_id = p_main_gpu_id; + use_gpu_mode = gpu_model; + if (!local_sycl_device_mgr) + delete local_sycl_device_mgr; + + if (use_gpu_mode == SYCL_MUL_GPU_MODE) { + local_sycl_device_mgr = + new sycl_device_mgr(SYCL_DEVICE_FILTER_TOP_LEVEL_ZERO); + } else { + GGML_ASSERT(main_gpu_id >= 0); + local_sycl_device_mgr = new sycl_device_mgr(SYCL_DEVICE_FILTER_ALL); + } + + device_count = local_sycl_device_mgr->get_device_count(); + + int64_t total_vram = 0; + + for (int i = 0; i < device_count; ++i) { + devices[i].vmm = 0; + dpct::device_info prop; + SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( + prop, dpct::dev_mgr::instance().get_device(i)))); + + default_tensor_split[i] = total_vram; + total_vram += prop.get_global_mem_size(); + + devices[i].cc = + 100 * prop.get_major_version() + 10 * prop.get_minor_version(); + } + + for (int id = 0; id < device_count; ++id) { + default_tensor_split[id] /= total_vram; + } + + g_work_group_size = work_group_size(main_gpu_id); + print_gpu_device_list(); + } + +}; struct ggml_sycl_pool { virtual ~ggml_sycl_pool() = default; @@ -262,16 +460,15 @@ struct ggml_backend_sycl_context { queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } }; - explicit ggml_backend_sycl_context(int device) : + explicit ggml_backend_sycl_context(struct ggml_sycl_device_info &sycl_device_info, int device) : device(device), name(GGML_SYCL_NAME + std::to_string(device)) { + qptrs[device][0] = sycl_device_info.local_sycl_device_mgr->queues[device]; } queue_ptr stream(int device, int stream) { - if (qptrs[device][stream] == nullptr) { - qptrs[device][stream] = &(dpct::get_current_device().default_queue()); - } - return qptrs[device][stream]; + assert(qptrs[device][0] != nullptr); + return qptrs[device][0]; } queue_ptr stream() { diff --git a/src/llama.cpp b/src/llama.cpp index eea532f6a..98a386e72 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -17619,6 +17619,7 @@ struct llama_context * llama_new_context_with_model( #elif defined(GGML_USE_SYCL) // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { + ggml_sycl_set_single_device(model->main_gpu); ggml_backend_t backend = 
ggml_backend_sycl_init(model->main_gpu); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
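
For readers applying this patch, a minimal usage sketch (not part of the diff) of the new single-device entry point, modelled on the llama.cpp hunk above. The helper name init_sycl_backend and its boolean flag are assumptions for illustration only:

// Sketch: select single-GPU mode before creating a SYCL backend, as llama.cpp
// now does for LLAMA_SPLIT_MODE_NONE / LLAMA_SPLIT_MODE_ROW.
#include "ggml-backend.h"
#include "ggml-sycl.h"

static ggml_backend_t init_sycl_backend(int main_gpu_id, bool single_device) {
    if (single_device) {
        // re-filters the device list (SYCL_SINGLE_GPU_MODE in refresh_device())
        // and makes main_gpu_id the main device
        ggml_sycl_set_single_device(main_gpu_id);
    }
    // main_gpu_id must be a valid device id; check_allow_device_id() asserts otherwise
    return ggml_backend_sycl_init(main_gpu_id);
}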
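
A short sketch of enumerating the devices the backend reports after this change, using only the public functions declared in ggml-sycl.h above (buffer sizes are arbitrary):

// Sketch: list SYCL devices with their names and memory via the public API.
#include <cstdio>
#include "ggml-sycl.h"

static void list_sycl_devices() {
    const int count = ggml_backend_sycl_get_device_count();
    for (int id = 0; id < count; ++id) {
        char desc[128];
        size_t free_mem = 0, total_mem = 0;
        ggml_sycl_get_device_description(id, desc, sizeof(desc));
        ggml_backend_sycl_get_device_memory(id, &free_mem, &total_mem);
        printf("SYCL device %d: %s, %zu/%zu bytes free\n",
               id, desc, free_mem, total_mem);
    }
}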
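
The default_tensor_split logic that refresh_device() now owns is a cumulative-prefix split: entry i stores the total VRAM of devices 0..i-1, later normalized by the grand total. A self-contained sketch with hypothetical VRAM sizes:

// Sketch: how refresh_device() derives default_tensor_split.
#include <array>
#include <cstdio>

int main() {
    const int device_count = 3;
    const double vram[device_count] = {16e9, 16e9, 8e9};  // hypothetical sizes
    std::array<double, device_count> split{};
    double total = 0.0;
    for (int i = 0; i < device_count; ++i) {
        split[i] = total;   // prefix sum before adding this device
        total += vram[i];
    }
    for (int i = 0; i < device_count; ++i) {
        split[i] /= total;  // 0.00, 0.40, 0.80 for the sizes above
    }
    for (int i = 0; i < device_count; ++i) {
        printf("device %d starts at fraction %.2f\n", i, split[i]);
    }
}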
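
Similarly, the per-device row ranges in ggml_sycl_op_mul_mat (now indexed by id) follow the cumulative tensor_split and are rounded down to the mul_mat_q tile size. A simplified standalone sketch with hypothetical values; the guards for row_low/row_high reaching ne01 are omitted:

// Sketch: per-device [row_low, row_high) from tensor_split, tile-aligned.
#include <cstdint>
#include <cstdio>

int main() {
    const int device_count = 2;
    const float tensor_split[device_count] = {0.0f, 0.6f};  // hypothetical split
    const int64_t ne01 = 1000;    // rows in src0
    const int64_t rounding = 64;  // tile size, as from get_row_rounding()

    for (int id = 0; id < device_count; ++id) {
        int64_t row_low  = 0;
        int64_t row_high = ne01;
        if (id != 0) {
            row_low = (int64_t)(ne01 * tensor_split[id]);
            row_low -= row_low % rounding;   // align to tile size
        }
        if (id != device_count - 1) {
            row_high = (int64_t)(ne01 * tensor_split[id + 1]);
            row_high -= row_high % rounding;
        }
        printf("device %d: rows [%lld, %lld)\n",
               id, (long long)row_low, (long long)row_high);
    }
}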