rebase
This commit is contained in:
parent d08c20edde
commit de91f8eee6
9 changed files with 793 additions and 376 deletions
@@ -296,15 +296,25 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
An example of such a log on a system with one *Intel CPU* and one *Intel GPU* can look like the following:
```
found 6 SYCL devices:
| | | |Compute |Max compute|Max work|Max sub| |
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
Part1:
|ID| Device Type| Ver| Name|Global mem size|
|--|-------------------|----|---------------------------------------|---------------|
| 0| [level_zero:gpu:0]| 1.3| Intel Data Center GPU Flex 170| 16225M|
| 1| [level_zero:gpu:1]| 1.3| Intel Data Center GPU Flex 170| 16225M|
| 2| [opencl:gpu:0]| 3.0| Intel Data Center GPU Flex 170| 16225M|
| 3| [opencl:gpu:1]| 3.0| Intel Data Center GPU Flex 170| 16225M|
| 4| [opencl:cpu:0]| 3.0| Intel Xeon Gold 6346 CPU @ 3.10GHz| 540700M|
| 5| [opencl:acc:0]| 1.2| Intel FPGA Emulation Device| 540700M|
Part2:
|ID|Max compute units|Max work group|Max subgroup| Driver version|
|--|-----------------|--------------|------------|----------------------------------|
| 0| 512| 1024| 32| 1.3.27642|
| 1| 512| 1024| 32| 1.3.27642|
| 2| 512| 1024| 32| 23.43.27642.40|
| 3| 512| 1024| 32| 23.43.27642.40|
| 4| 64| 8192| 64|2024.17.5.0.08_160000.xmain-hotfix|
| 5| 64| 67108864| 64|2024.17.5.0.08_160000.xmain-hotfix|

```

| Attribute | Note |

@@ -477,15 +487,24 @@ build\bin\ls-sycl-device.exe
The output of this command on a system with one *Intel CPU* and one *Intel GPU* would look like the following:
```
found 6 SYCL devices:
| | | |Compute |Max compute|Max work|Max sub| |
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
Part1:
|ID| Device Type| Ver| Name|Global mem size|
|--|-------------------|----|---------------------------------------|---------------|
| 0| [level_zero:gpu:0]| 1.3| Intel Data Center GPU Flex 170| 16225M|
| 1| [level_zero:gpu:1]| 1.3| Intel Data Center GPU Flex 170| 16225M|
| 2| [opencl:gpu:0]| 3.0| Intel Data Center GPU Flex 170| 16225M|
| 3| [opencl:gpu:1]| 3.0| Intel Data Center GPU Flex 170| 16225M|
| 4| [opencl:cpu:0]| 3.0| Intel Xeon Gold 6346 CPU @ 3.10GHz| 540700M|
| 5| [opencl:acc:0]| 1.2| Intel FPGA Emulation Device| 540700M|
Part2:
|ID|Max compute units|Max work group|Max subgroup| Driver version|
|--|-----------------|--------------|------------|----------------------------------|
| 0| 512| 1024| 32| 1.3.27642|
| 1| 512| 1024| 32| 1.3.27642|
| 2| 512| 1024| 32| 23.43.27642.40|
| 3| 512| 1024| 32| 23.43.27642.40|
| 4| 64| 8192| 64|2024.17.5.0.08_160000.xmain-hotfix|
| 5| 64| 67108864| 64|2024.17.5.0.08_160000.xmain-hotfix|

```

@@ -556,6 +575,32 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable the log function controlled by the macro GGML_SYCL_DEBUG |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support querying the free memory of the GPU via sycl::aspect::ext_intel_free_memory.<br>Recommended when --split-mode = layer (see the sketch below) |
| GGML_SYCL_VISIBLE_DEVICES | id1,id2,... | Similar to `CUDA_VISIBLE_DEVICES`; defines the list of SYCL device IDs that are visible, e.g. "0", "0,2", "2,1" |
| ONEAPI_DEVICE_SELECTOR | Refer to [oneapi-device-selector](https://intel.github.io/llvm-docs/EnvironmentVariables.html#oneapi-device-selector) | Used to limit the choice of devices available when the SYCL-using application is run |
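
The effect of `ZES_ENABLE_SYSMAN` can be checked from application code through the memory query touched by this PR. This is a minimal sketch, not part of the upstream examples; it assumes the public header is `ggml-sycl.h` and uses only `ggml_backend_sycl_get_device_count`, `ggml_backend_sycl_get_device_id`, and `ggml_backend_sycl_get_device_memory`:

```cpp
// Sketch: print free/total memory for every device chosen by the SYCL backend.
// The header name and the MB formatting are assumptions for illustration.
#include <cstdio>
#include "ggml-sycl.h"

int main() {
    int count = ggml_backend_sycl_get_device_count();
    for (int i = 0; i < count; ++i) {
        int id = ggml_backend_sycl_get_device_id(i);   // map backend index -> SYCL device ID
        size_t free = 0, total = 0;
        ggml_backend_sycl_get_device_memory(id, &free, &total);
        printf("device %d: free %zu MB, total %zu MB\n", id, free / (1024 * 1024), total / (1024 * 1024));
    }
    return 0;
}
```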

##### Choose SYCL Devices at Runtime

At runtime, one physical device may be mapped to two logical SYCL devices, one per backend (Level-Zero and OpenCL), so the SYCL view shows more devices than are physically present. Avoid running code on two logical devices that share the same physical device at the same time.

The SYCL backend supports a dGPU, an iGPU, or both in the same machine.

##### SYCL Backend Rules

|Mode|Explanation|Example|Recommended Cases|Note|
|-|-|-|-|-|
|Normal|Use the most powerful devices. Default mode; no special setting.<br>The SYCL backend detects and chooses the **Level-Zero** devices with the top `Max compute units`.||Most cases for normal users.||
|Advanced|Allow the user to choose one or more SYCL devices, which may be Level-Zero, OpenCL, or both.<br>Set the device list with the environment variable **GGML_SYCL_VISIBLE_DEVICES**, like `CUDA_VISIBLE_DEVICES`.<br>The SYCL backend will use all devices in the list (see the sketch after this table).|`set/export GGML_SYCL_VISIBLE_DEVICES=1`<br>`set/export GGML_SYCL_VISIBLE_DEVICES=0,1`<br>`set/export GGML_SYCL_VISIBLE_DEVICES=2,1`|Use the iGPU or both GPUs in a dGPU + iGPU environment.<br>Use one dGPU in a multi-dGPU environment.<br>Use one or more OpenCL devices.|There is a known issue with OpenCL devices. WIP.|
|Developer|Allow SYCL developers to choose one or more SYCL devices via the environment variable **ONEAPI_DEVICE_SELECTOR**, which has a flexible grammar.<br>Refer to [oneapi-device-selector](https://intel.github.io/llvm-docs/EnvironmentVariables.html#oneapi-device-selector).|`set/export ONEAPI_DEVICE_SELECTOR=level_zero:1`<br>`set/export ONEAPI_DEVICE_SELECTOR=opencl:*`<br>`set/export ONEAPI_DEVICE_SELECTOR=opencl:gpu;level_zero:gpu`|Covers the Advanced mode. It affects the **Normal** and **Advanced** modes as the lower-level mechanism.<br>The flexible grammar supports more complex device environments.|There is a known issue with OpenCL devices. WIP.|
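
The mode is selected by which environment variable is present. The sketch below is an illustration only, not the upstream dispatch code: it assumes the choice is driven by whether `GGML_SYCL_VISIBLE_DEVICES` is set, using the `env_existed` helper and the `sycl_device_mgr` filter values introduced in this PR, while `ONEAPI_DEVICE_SELECTOR` is handled by the SYCL runtime itself:

```cpp
// Illustrative sketch only: mirrors the filter values added in this PR,
// but the dispatch logic shown here is an assumption, not the upstream code.
#include <cstdlib>

enum ggml_sycl_backend_device_filter {
    SYCL_DEVICES_TOP_LEVEL_ZERO,  // Normal mode: Level-Zero devices with top Max compute units
    SYCL_ALL_DEVICES,             // all detected SYCL devices
    SYCL_VISIBLE_DEVICES          // Advanced mode: devices listed in GGML_SYCL_VISIBLE_DEVICES
};

static bool env_existed(const char *env_name) {
    return std::getenv(env_name) != nullptr;
}

static ggml_sycl_backend_device_filter choose_device_filter() {
    if (env_existed("GGML_SYCL_VISIBLE_DEVICES")) {
        return SYCL_VISIBLE_DEVICES;     // Advanced mode
    }
    return SYCL_DEVICES_TOP_LEVEL_ZERO;  // Normal mode (default)
}
```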

##### Parameters of llama.cpp

The device-selection parameters of llama.cpp work together with the SYCL backend rules to decide the final set of devices. The user may run on one, or on all, of the devices chosen by the SYCL backend rules.

|Device|Values|Note|
|-|-|-|
|Single Device|`--split-mode=none` and `--main-gpu=id`|The value of `main-gpu` must be in the chosen device list printed during llama.cpp startup, e.g. `detect 2 SYCL level-zero GPUs:[0,1]` means `main-gpu` should be set to `0` or `1` (see the sketch below).|
|Multiple Devices|`--split-mode=layer`|Default|
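
The same single-device choice can be made programmatically. A minimal sketch, assuming the public header is `ggml-sycl.h` and using only `ggml_backend_sycl_get_device_count`, `ggml_backend_sycl_get_device_id`, and `ggml_backend_sycl_init` from this PR, plus `ggml_backend_free` from the common backend API:

```cpp
// Sketch: create a backend for the first device chosen by the SYCL backend rules,
// similar to running with --split-mode=none --main-gpu=<id>.
#include <cstdio>
#include "ggml-sycl.h"

int main() {
    int count = ggml_backend_sycl_get_device_count();
    if (count < 1) {
        fprintf(stderr, "no SYCL device available\n");
        return 1;
    }
    int main_gpu_id = ggml_backend_sycl_get_device_id(0); // a valid value for --main-gpu
    ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_id);
    if (backend == nullptr) {
        fprintf(stderr, "failed to initialize SYCL backend for device %d\n", main_gpu_id);
        return 1;
    }
    // ... build and compute a graph with this backend ...
    ggml_backend_free(backend);
    return 0;
}
```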


## Known Issues

@@ -2,6 +2,10 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT

add_compile_options(-I${PROJECT_SOURCE_DIR}/ggml)
add_compile_options(-I${PROJECT_SOURCE_DIR}/ggml/src)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl")

set(TARGET llama-ls-sycl-device)
add_executable(${TARGET} ls-sycl-device.cpp)
install(TARGETS ${TARGET} RUNTIME)

@@ -6,6 +6,6 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force


.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0

@@ -34,6 +34,10 @@ GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *des
GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);

GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int index);
GGML_API GGML_CALL void ggml_sycl_set_single_device(int main_gpu_id);

// SYCL doesn't support registering host memory, keep here for reference
// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
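
The two new lookups are intended to be inverses over the allowed-device list: `ggml_backend_sycl_get_device_id` maps a backend index (0..count-1) to a SYCL device ID, and `ggml_backend_sycl_get_device_index` maps a device ID back to its index (the device-manager helper later in this diff returns -1 for IDs that are not allowed). A minimal round-trip sketch, assuming the public header is `ggml-sycl.h`:

```cpp
// Sketch: check the index <-> device-ID round trip of the new API.
#include <cassert>
#include <cstdio>
#include "ggml-sycl.h"

int main() {
    int count = ggml_backend_sycl_get_device_count();
    for (int i = 0; i < count; ++i) {
        int id  = ggml_backend_sycl_get_device_id(i);     // backend index -> SYCL device ID
        int idx = ggml_backend_sycl_get_device_index(id); // SYCL device ID -> backend index
        assert(idx == i);
        printf("index %d <-> device id %d\n", i, id);
    }
    return 0;
}
```
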
@@ -39,7 +39,7 @@
#include "ggml-sycl/backend.hpp"
#include "ggml-sycl/presets.hpp"

bool ggml_sycl_loaded(void);

void ggml_sycl_free_data(struct ggml_tensor * tensor);
void ggml_sycl_copy_to_device(struct ggml_tensor * tensor);
void ggml_sycl_set_main_device(int main_device);

@@ -48,8 +48,7 @@ void ggml_sycl_get_device_description(int device, char * description, size_t d
bool ggml_backend_is_sycl(ggml_backend_t backend);
int ggml_backend_sycl_get_device(ggml_backend_t backend);
static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer);
static inline int get_sycl_env(const char *env_name, int default_val);
static inline int get_work_group_size(const sycl::device& device);


void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
                    const void *ptr_src, size_t size) {

@ -2015,184 +2014,23 @@ static void im2col_sycl(const float *x, T *dst, int IW, int IH,
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
static bool g_sycl_loaded = false;
|
||||
|
||||
bool ggml_sycl_loaded(void) {
|
||||
return g_sycl_loaded;
|
||||
}
|
||||
|
||||
void print_device_detail(int id, sycl::device &device, std::string device_type) {
|
||||
|
||||
dpct::device_info prop;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
dpct::get_device_info(prop, device)));
|
||||
|
||||
std::string version;
|
||||
version += std::to_string(prop.get_major_version());
|
||||
version += ".";
|
||||
version += std::to_string(prop.get_minor_version());
|
||||
|
||||
device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
|
||||
std::string name = std::string(prop.get_name());
|
||||
name = std::regex_replace(name, std::regex("\\(R\\)"), "");
|
||||
name = std::regex_replace(name, std::regex("\\(TM\\)"), "");
|
||||
|
||||
auto global_mem_size = prop.get_global_mem_size()/1000000;
|
||||
|
||||
fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
|
||||
name.c_str(), version.c_str(), prop.get_max_compute_units(),
|
||||
prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
|
||||
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
|
||||
}
|
||||
|
||||
void ggml_backend_sycl_print_sycl_devices() {
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
|
||||
int device_count = dpct::dev_mgr::instance().device_count();
|
||||
std::map<std::string, size_t> DeviceNums;
|
||||
fprintf(stderr, "found %d SYCL devices:\n", device_count);
|
||||
fprintf(stderr, "| | | | |Max | |Max |Global | |\n");
|
||||
fprintf(stderr, "| | | | |compute|Max work|sub |mem | |\n");
|
||||
fprintf(stderr, "|ID| Device Type| Name|Version|units |group |group|size | Driver version|\n");
|
||||
fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n");
|
||||
for (int id = 0; id < device_count; ++id) {
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
||||
sycl::backend backend = device.get_backend();
|
||||
std::string backend_type = get_device_backend_and_type(device);
|
||||
int type_id=DeviceNums[backend_type]++;
|
||||
std::stringstream device_type;
|
||||
device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]";
|
||||
print_device_detail(id, device, device_type.str());
|
||||
inline void check_allow_device_id(const int device_id) {
|
||||
if (ggml_sycl_info().device_count<1) {
|
||||
fprintf(stderr, "%s: not detect any SYCL devices, check GPU driver or unset GGML_SYCL_VISIBLE_DEVICES and ONEAPI_DEVICE_SELECTOR\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
static inline int get_sycl_env(const char *env_name, int default_val) {
|
||||
char *user_device_string = getenv(env_name);
|
||||
int user_number = default_val;
|
||||
|
||||
unsigned n;
|
||||
if (user_device_string != NULL &&
|
||||
sscanf(user_device_string, " %u", &n) == 1) {
|
||||
user_number = (int)n;
|
||||
} else {
|
||||
user_number = default_val;
|
||||
if (!ggml_sycl_info().is_allowed_device(device_id)) {
|
||||
fprintf(stderr, "%s: device_id:%d is out of range [%s]. To use any SYCL devices, set/export GGML_SYCL_VISIBLE_DEVICES or ONEAPI_DEVICE_SELECTOR\n",
|
||||
__func__, device_id, ggml_sycl_info().devices_list());
|
||||
exit_with_stack_print();
|
||||
}
|
||||
return user_number;
|
||||
}
|
||||
|
||||
static void ggml_check_sycl() try {
|
||||
static bool initialized = false;
|
||||
|
||||
if (!initialized) {
|
||||
fprintf(stderr, "[SYCL] call ggml_check_sycl\n");
|
||||
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
||||
|
||||
fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
|
||||
|
||||
#if defined(GGML_SYCL_F16)
|
||||
fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__);
|
||||
#else
|
||||
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
|
||||
#endif
|
||||
|
||||
/* NOT REMOVE, keep it for next optimize for XMX.
|
||||
#if defined(SYCL_USE_XMX)
|
||||
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
|
||||
#else
|
||||
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
|
||||
#endif
|
||||
*/
|
||||
|
||||
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
|
||||
dpct::dev_mgr::instance().device_count()) != 0) {
|
||||
initialized = true;
|
||||
g_sycl_loaded = false;
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
|
||||
ggml_backend_sycl_print_sycl_devices();
|
||||
initialized = true;
|
||||
g_sycl_loaded = true;
|
||||
}
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
static ggml_sycl_device_info ggml_sycl_init() {
|
||||
ggml_sycl_device_info info = {};
|
||||
|
||||
info.device_count = dpct::dev_mgr::instance().device_count();
|
||||
if (info.device_count == 0) {
|
||||
fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__);
|
||||
return info;
|
||||
}
|
||||
|
||||
GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES);
|
||||
|
||||
int64_t total_vram = 0;
|
||||
#if defined(GGML_SYCL_FORCE_MMQ)
|
||||
fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__);
|
||||
#else
|
||||
fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__);
|
||||
#endif
|
||||
#if defined(SYCL_USE_XMX)
|
||||
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
|
||||
#else
|
||||
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
|
||||
#endif
|
||||
fprintf(stderr, "%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count);
|
||||
|
||||
for (int i = 0; i < info.device_count; ++i) {
|
||||
info.devices[i].vmm = 0;
|
||||
dpct::device_info prop;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
||||
prop, dpct::dev_mgr::instance().get_device(i))));
|
||||
|
||||
info.default_tensor_split[i] = total_vram;
|
||||
total_vram += prop.get_global_mem_size();
|
||||
|
||||
info.devices[i].cc =
|
||||
100 * prop.get_major_version() + 10 * prop.get_minor_version();
|
||||
}
|
||||
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
info.default_tensor_split[id] /= total_vram;
|
||||
}
|
||||
return info;
|
||||
}
|
||||
|
||||
const ggml_sycl_device_info & ggml_sycl_info() {
|
||||
static ggml_sycl_device_info info = ggml_sycl_init();
|
||||
return info;
|
||||
}
|
||||
|
||||
/*
|
||||
device_index: device index from 0 to n (continue numbers).
|
||||
It is used for device select/set in SYCL backend internal data structure.
|
||||
*/
|
||||
inline void check_allow_gpu_index(const int device_index) {
|
||||
if (device_index >= ggml_sycl_info().device_count) {
|
||||
char error_buf[256];
|
||||
snprintf(
|
||||
error_buf,
|
||||
sizeof(error_buf),
|
||||
"%s error: device_index:%d is out of range: [0-%d]",
|
||||
__func__,
|
||||
device_index,
|
||||
ggml_sycl_info().device_count - 1);
|
||||
fprintf(stderr, "%s\n", error_buf);
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
|
||||
// buffer pool for sycl (legacy)
|
||||
struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
||||
static const int MAX_SYCL_BUFFERS = 256;
|
||||
|
||||
int device;
|
||||
int device_id;
|
||||
queue_ptr qptr;
|
||||
struct ggml_sycl_buffer {
|
||||
void * ptr = nullptr;
|
||||
|
@ -2204,7 +2042,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
|||
|
||||
explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) :
|
||||
qptr(qptr_),
|
||||
device(device_) {
|
||||
device_id(device_) {
|
||||
}
|
||||
|
||||
~ggml_sycl_pool_leg() {
|
||||
|
@ -2288,12 +2126,12 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
|||
}
|
||||
};
|
||||
|
||||
std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
|
||||
std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device_id) {
|
||||
// TBD: NO VMM support
|
||||
// if (ggml_sycl_info().devices[device].vmm) {
|
||||
// return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(device));
|
||||
// if (ggml_sycl_info().devices[device_id].vmm) {
|
||||
// return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(device_id));
|
||||
// }
|
||||
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
|
||||
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device_id));
|
||||
}
|
||||
|
||||
// TBD pool with virtual memory management
|
||||
|
@ -2705,12 +2543,13 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
|
|||
int64_t min_compute_capability = INT_MAX;
|
||||
int64_t max_compute_capability = INT_MIN;
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) {
|
||||
if (min_compute_capability > ggml_sycl_info().devices[i].cc) {
|
||||
min_compute_capability = ggml_sycl_info().devices[i].cc;
|
||||
if (min_compute_capability > ggml_sycl_info().devices[id].cc) {
|
||||
min_compute_capability = ggml_sycl_info().devices[id].cc;
|
||||
}
|
||||
if (max_compute_capability < ggml_sycl_info().devices[i].cc) {
|
||||
max_compute_capability = ggml_sycl_info().devices[i].cc;
|
||||
if (max_compute_capability < ggml_sycl_info().devices[id].cc) {
|
||||
max_compute_capability = ggml_sycl_info().devices[id].cc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3133,17 +2972,20 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
|
|||
|
||||
#ifdef NDEBUG
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
SYCL_CHECK(ggml_sycl_set_device(i));
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
SYCL_CHECK(ggml_sycl_set_device(id));
|
||||
}
|
||||
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
SYCL_CHECK(ggml_sycl_set_device(i));
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
SYCL_CHECK(ggml_sycl_set_device(id));
|
||||
|
||||
for (int id_other = 0; id_other < ggml_sycl_info().device_count; ++id_other) {
|
||||
if (i == id_other) {
|
||||
for (int i_other = 0; i_other < ggml_sycl_info().device_count; ++i_other) {
|
||||
int id_other = ggml_backend_sycl_get_device_id(i_other);
|
||||
if (id == id_other) {
|
||||
continue;
|
||||
}
|
||||
if (i != main_device && id_other != main_device) {
|
||||
if (id != main_device && id_other != main_device) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -3241,9 +3083,10 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
queue_ptr main_stream = ctx.stream();
|
||||
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
// by default, use all rows
|
||||
dev[i].row_low = 0;
|
||||
dev[i].row_high = ne01;
|
||||
dev[id].row_low = 0;
|
||||
dev[id].row_high = ne01;
|
||||
|
||||
// for multi GPU, get the row boundaries from tensor split
|
||||
// and round to mul_mat_q tile sizes
|
||||
|
@ -3251,51 +3094,52 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
const int64_t rounding = get_row_rounding(src0->type, tensor_split);
|
||||
|
||||
if (i != 0) {
|
||||
dev[i].row_low = ne01*tensor_split[i];
|
||||
if (dev[i].row_low < ne01) {
|
||||
dev[i].row_low -= dev[i].row_low % rounding;
|
||||
dev[id].row_low = ne01*tensor_split[i];
|
||||
if (dev[id].row_low < ne01) {
|
||||
dev[id].row_low -= dev[id].row_low % rounding;
|
||||
}
|
||||
}
|
||||
|
||||
if (i != ggml_sycl_info().device_count - 1) {
|
||||
dev[i].row_high = ne01*tensor_split[i + 1];
|
||||
if (dev[i].row_high < ne01) {
|
||||
dev[i].row_high -= dev[i].row_high % rounding;
|
||||
dev[id].row_high = ne01*tensor_split[i + 1];
|
||||
if (dev[id].row_high < ne01) {
|
||||
dev[id].row_high -= dev[id].row_high % rounding;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) {
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
|
||||
continue;
|
||||
}
|
||||
|
||||
used_devices++;
|
||||
|
||||
const bool src1_on_device = i == ctx.device;
|
||||
const bool dst_on_device = i == ctx.device;
|
||||
const bool src1_on_device = id == ctx.device;
|
||||
const bool dst_on_device = id == ctx.device;
|
||||
|
||||
ggml_sycl_set_device(i);
|
||||
queue_ptr stream = ctx.stream(i, 0);
|
||||
ggml_sycl_set_device(id);
|
||||
queue_ptr stream = ctx.stream(id, 0);
|
||||
|
||||
if (src0_is_contiguous) {
|
||||
dev[i].src0_dd = (char *) src0->data;
|
||||
dev[id].src0_dd = (char *) src0->data;
|
||||
} else {
|
||||
dev[i].src0_dd = dev[i].src0_dd_alloc.alloc(ctx.pool(i), ggml_nbytes(src0));
|
||||
dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
|
||||
}
|
||||
|
||||
if (src1_on_device && src1_is_contiguous) {
|
||||
dev[i].src1_ddf = (float *) src1->data;
|
||||
dev[id].src1_ddf = (float *) src1->data;
|
||||
} else {
|
||||
dev[i].src1_ddf = dev[i].src1_ddf_alloc.alloc(ctx.pool(i), ggml_nelements(src1));
|
||||
dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
|
||||
}
|
||||
|
||||
if (convert_src1_to_q8_1) {
|
||||
dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
|
||||
dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
|
||||
|
||||
if (src1_on_device && src1_is_contiguous) {
|
||||
quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
|
||||
quantize_row_q8_1_sycl(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
|
||||
/*
|
||||
DPCT1010:90: SYCL uses exceptions to report errors and does not
|
||||
use the error codes. The call was replaced with 0. You need to
|
||||
|
@ -3306,10 +3150,10 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
}
|
||||
|
||||
if (dst_on_device) {
|
||||
dev[i].dst_dd = (float *) dst->data;
|
||||
dev[id].dst_dd = (float *) dst->data;
|
||||
} else {
|
||||
const size_t size_dst_ddf = split ? (dev[i].row_high - dev[i].row_low)*ne1 : ggml_nelements(dst);
|
||||
dev[i].dst_dd = dev[i].dst_dd_alloc.alloc(ctx.pool(i), size_dst_ddf);
|
||||
const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
|
||||
dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3333,19 +3177,20 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
|
||||
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) {
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const bool src1_on_device = i == ctx.device;
|
||||
const bool dst_on_device = i == ctx.device;
|
||||
const int64_t row_diff = dev[i].row_high - dev[i].row_low;
|
||||
const bool src1_on_device = id == ctx.device;
|
||||
const bool dst_on_device = id == ctx.device;
|
||||
const int64_t row_diff = dev[id].row_high - dev[id].row_low;
|
||||
|
||||
ggml_sycl_set_device(i);
|
||||
queue_ptr stream = ctx.stream(i, is);
|
||||
ggml_sycl_set_device(id);
|
||||
queue_ptr stream = ctx.stream(id, is);
|
||||
|
||||
// wait for main GPU data if necessary
|
||||
if (split && (i != ctx.device || is != 0)) {
|
||||
if (split && (id != ctx.device || is != 0)) {
|
||||
/*
|
||||
DPCT1009:163: SYCL uses exceptions to report errors and does not
|
||||
use the error codes. The original code was commented out and a
|
||||
|
@ -3362,20 +3207,20 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
||||
|
||||
// for split tensors the data begins at i0 == i0_offset_low
|
||||
char * src0_dd_i = dev[i].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
|
||||
float * src1_ddf_i = dev[i].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
|
||||
char * src1_ddq_i = dev[i].src1_ddq + src1_ddq_i_offset;
|
||||
float * dst_dd_i = dev[i].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
|
||||
char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
|
||||
float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
|
||||
char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset;
|
||||
float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
|
||||
|
||||
// the main device memory buffer can be on VRAM scratch, with space for all partial results
|
||||
// in that case an offset on dst_ddf_i is needed
|
||||
if (i == ctx.device) {
|
||||
dst_dd_i += dev[i].row_low; // offset is 0 if no tensor split
|
||||
if (id == ctx.device) {
|
||||
dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
|
||||
}
|
||||
|
||||
// copy src0, src1 to device if necessary
|
||||
if (src1_is_contiguous) {
|
||||
if (i != ctx.device) {
|
||||
if (id != ctx.device) {
|
||||
if (convert_src1_to_q8_1) {
|
||||
char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
|
||||
|
@ -3410,14 +3255,14 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
}
|
||||
|
||||
if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
|
||||
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[i].row_low, dev[i].row_high, stream));
|
||||
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
|
||||
}
|
||||
if (src1->type == GGML_TYPE_F16) {
|
||||
src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10;
|
||||
}
|
||||
// do the computation
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
|
||||
dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream)));
|
||||
dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream)));
|
||||
/*
|
||||
DPCT1010:93: SYCL uses exceptions to report errors and does not
|
||||
use the error codes. The call was replaced with 0. You need to
|
||||
|
@ -3436,7 +3281,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
|
||||
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
||||
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
||||
dhf_dst_i += src1_col_0*ne0 + dev[i].row_low;
|
||||
dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
|
||||
dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
|
||||
|
@ -3453,7 +3298,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
}
|
||||
|
||||
// add event for the main device to wait on until other device is done
|
||||
if (split && (i != ctx.device || is != 0)) {
|
||||
if (split && (id != ctx.device || is != 0)) {
|
||||
/*
|
||||
DPCT1024:94: The original code returned the error code that
|
||||
was further consumed by the program logic. This original
|
||||
|
@ -3461,7 +3306,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
program logic consuming the error code.
|
||||
*/
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
*src0_extra->events[i][is] =
|
||||
*src0_extra->events[id][is] =
|
||||
stream->ext_oneapi_submit_barrier()));
|
||||
}
|
||||
}
|
||||
|
@ -3475,13 +3320,14 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
|
||||
ggml_sycl_set_device(ctx.device);
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
if (dev[i].row_low == dev[i].row_high) {
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
if (dev[id].row_low == dev[id].row_high) {
|
||||
continue;
|
||||
}
|
||||
for (int64_t is = 0; is < is_max; ++is) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
ctx.stream()->ext_oneapi_submit_barrier(
|
||||
{*src0_extra->events[i][is]})));
|
||||
{*src0_extra->events[id][is]})));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3891,9 +3737,10 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
|||
if (split) {
|
||||
ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context;
|
||||
auto & tensor_split = buft_ctx->tensor_split;
|
||||
for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
// skip devices that are not going to do any work:
|
||||
if (tensor_split[id] >= (id + 1 < ggml_sycl_info().device_count ? tensor_split[id + 1] : 1.0f)) {
|
||||
if (tensor_split[i] >= (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -4310,17 +4157,17 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
|
|||
return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
|
||||
}
|
||||
|
||||
void ggml_sycl_set_main_device(const int main_device) try {
|
||||
if (dpct::get_current_device_id() == main_device) return;
|
||||
check_allow_gpu_index(main_device);
|
||||
dpct::select_device(main_device);
|
||||
void ggml_sycl_set_main_device(const int main_device_id) try {
|
||||
if (dpct::get_current_device_id() == main_device_id) return;
|
||||
check_allow_device_id(main_device_id);
|
||||
dpct::select_device(main_device_id);
|
||||
|
||||
if (g_ggml_sycl_debug) {
|
||||
dpct::device_info prop;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
||||
prop, dpct::dev_mgr::instance().get_device(main_device))));
|
||||
prop, dpct::dev_mgr::instance().get_device(main_device_id))));
|
||||
fprintf(stderr, "Using device %d (%s) as main device\n",
|
||||
main_device, prop.get_name());
|
||||
main_device_id, prop.get_name());
|
||||
}
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
|
@ -4330,7 +4177,6 @@ catch (sycl::exception const &exc) {
|
|||
}
|
||||
|
||||
bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * tensor) {
|
||||
if (!g_sycl_loaded) return false;
|
||||
|
||||
ggml_sycl_func_t func;
|
||||
|
||||
|
@ -4471,13 +4317,19 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
|
|||
return true;
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int index) {
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_id\n");
|
||||
return ggml_sycl_info().get_device_id(index);
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
|
||||
for(int i=0;i<max_len;i++) id_list[i] = -1;
|
||||
|
||||
for (int i=0;i< ggml_sycl_info().device_count;i++){
|
||||
if (i>=max_len) break;
|
||||
id_list[i] = i;
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
id_list[i] = id;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -4501,12 +4353,12 @@ catch (sycl::exception const &exc) {
|
|||
std::exit(1);
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
|
||||
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device_id, char *description,
|
||||
size_t description_size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
|
||||
dpct::device_info prop;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
||||
prop, dpct::dev_mgr::instance().get_device(device))));
|
||||
prop, dpct::dev_mgr::instance().get_device(device_id))));
|
||||
snprintf(description, description_size, "%s", prop.get_name());
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
|
@ -4515,10 +4367,10 @@ catch (sycl::exception const &exc) {
|
|||
std::exit(1);
|
||||
}
|
||||
|
||||
GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
|
||||
GGML_CALL void ggml_backend_sycl_get_device_memory(int device_id, size_t *free,
|
||||
size_t *total) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
|
||||
ggml_sycl_set_device(device);
|
||||
ggml_sycl_set_device(device_id);
|
||||
|
||||
/*
|
||||
DPCT1009:218: SYCL uses exceptions to report errors and does not use the
|
||||
|
@ -4531,7 +4383,7 @@ GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
|
|||
You may need to adjust the code.
|
||||
*/
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
dpct::dev_mgr::instance().get_device(device).get_memory_info(*free, *total)));
|
||||
dpct::dev_mgr::instance().get_device(device_id).get_memory_info(*free, *total)));
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
|
@ -4553,9 +4405,9 @@ struct ggml_backend_sycl_buffer_context {
|
|||
queue_ptr stream;
|
||||
std::string name;
|
||||
|
||||
ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
|
||||
device(device), dev_ptr(dev_ptr), stream(stream) {
|
||||
check_allow_gpu_index(device);
|
||||
ggml_backend_sycl_buffer_context(int device_id, void * dev_ptr, queue_ptr stream) :
|
||||
device(device_id), dev_ptr(dev_ptr), stream(stream) {
|
||||
check_allow_device_id(device);
|
||||
name = (GGML_SYCL_NAME + std::to_string(device));
|
||||
}
|
||||
|
||||
|
@ -4831,71 +4683,66 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
|
|||
/* .is_host = */ nullptr,
|
||||
};
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
|
||||
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) {
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
|
||||
|
||||
if (device>=ggml_sycl_info().device_count or device<0) {
|
||||
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
|
||||
device, ggml_sycl_info().device_count-1);
|
||||
GGML_ASSERT(device<ggml_sycl_info().device_count);
|
||||
}
|
||||
check_allow_device_id(device_id);
|
||||
|
||||
static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
|
||||
|
||||
static bool ggml_backend_sycl_buffer_type_initialized = false;
|
||||
|
||||
if (!ggml_backend_sycl_buffer_type_initialized) {
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; i++) {
|
||||
auto & device_i = dpct::dev_mgr::instance().get_device(i);
|
||||
queue_ptr stream = &(device_i.default_queue());
|
||||
ggml_backend_sycl_buffer_types[i] = {
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
auto & device = dpct::dev_mgr::instance().get_device(id);
|
||||
queue_ptr stream = &(device.default_queue());
|
||||
ggml_backend_sycl_buffer_types[id] = {
|
||||
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
||||
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), stream},
|
||||
/* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), stream},
|
||||
};
|
||||
}
|
||||
ggml_backend_sycl_buffer_type_initialized = true;
|
||||
}
|
||||
return &ggml_backend_sycl_buffer_types[device];
|
||||
return &ggml_backend_sycl_buffer_types[device_id];
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
|
||||
|
||||
int device = ctx->device;
|
||||
if (device>=ggml_sycl_info().device_count or device<0) {
|
||||
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
|
||||
device, ggml_sycl_info().device_count-1);
|
||||
GGML_ASSERT(device<ggml_sycl_info().device_count);
|
||||
}
|
||||
check_allow_device_id(ctx->device);
|
||||
|
||||
static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
|
||||
|
||||
static bool ggml_backend_sycl_buffer_type_initialized = false;
|
||||
|
||||
if (!ggml_backend_sycl_buffer_type_initialized) {
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; i++) {
|
||||
ggml_backend_sycl_buffer_types[i] = {
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
ggml_backend_sycl_buffer_types[id] = {
|
||||
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
||||
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), ctx->stream(i, 0)},
|
||||
/* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), ctx->stream(id, 0)},
|
||||
};
|
||||
}
|
||||
ggml_backend_sycl_buffer_type_initialized = true;
|
||||
}
|
||||
return &ggml_backend_sycl_buffer_types[device];
|
||||
return &ggml_backend_sycl_buffer_types[ctx->device];
|
||||
}
|
||||
|
||||
// sycl split buffer type
|
||||
static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split, int id) {
|
||||
static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split, int i) {
|
||||
const int64_t nrows = ggml_nrows(tensor);
|
||||
const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
|
||||
|
||||
*row_low = id == 0 ? 0 : nrows*tensor_split[id];
|
||||
*row_low = i == 0 ? 0 : nrows*tensor_split[i];
|
||||
*row_low -= *row_low % rounding;
|
||||
if (id == ggml_sycl_info().device_count - 1) {
|
||||
if (i == ggml_sycl_info().device_count - 1) {
|
||||
*row_high = nrows;
|
||||
} else {
|
||||
*row_high = nrows*tensor_split[id + 1];
|
||||
*row_high = nrows*tensor_split[i + 1];
|
||||
*row_high -= *row_high % rounding;
|
||||
}
|
||||
}
|
||||
|
@ -4904,8 +4751,9 @@ struct ggml_backend_sycl_split_buffer_context {
|
|||
~ggml_backend_sycl_split_buffer_context() try {
|
||||
for (ggml_tensor_extra_gpu * extra : tensor_extras) {
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
|
||||
if (extra->events[i][is] != nullptr) {
|
||||
if (extra->events[id][is] != nullptr) {
|
||||
/*
|
||||
DPCT1009:206: SYCL uses exceptions to report errors and
|
||||
does not use the error codes. The original code was
|
||||
|
@ -4913,19 +4761,19 @@ struct ggml_backend_sycl_split_buffer_context {
|
|||
need to rewrite this code.
|
||||
*/
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
dpct::destroy_event(extra->events[i][is])));
|
||||
dpct::destroy_event(extra->events[id][is])));
|
||||
}
|
||||
}
|
||||
if (extra->data_device[i] != nullptr) {
|
||||
if (extra->data_device[id] != nullptr) {
|
||||
/*
|
||||
DPCT1009:207: SYCL uses exceptions to report errors and does
|
||||
not use the error codes. The original code was commented out
|
||||
and a warning string was inserted. You need to rewrite this
|
||||
code.
|
||||
*/
|
||||
ggml_sycl_set_device(i);
|
||||
ggml_sycl_set_device(id);
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(
|
||||
extra->data_device[i], *(streams[i]))));
|
||||
extra->data_device[id], *(streams[id]))));
|
||||
}
|
||||
}
|
||||
delete extra;
|
||||
|
@ -4979,6 +4827,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|||
ctx->streams.push_back(&(dpct::get_current_device().default_queue()));
|
||||
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
int64_t row_low, row_high;
|
||||
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
|
||||
|
||||
|
@ -4997,8 +4846,8 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|||
|
||||
// FIXME: do not crash if cudaMalloc fails
|
||||
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
||||
ggml_sycl_set_device(i);
|
||||
const queue_ptr stream = ctx->streams[i];
|
||||
ggml_sycl_set_device(id);
|
||||
const queue_ptr stream = ctx->streams[id];
|
||||
char * buf;
|
||||
/*
|
||||
DPCT1009:208: SYCL uses exceptions to report errors and does not use the
|
||||
|
@ -5021,7 +4870,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|||
.wait()));
|
||||
}
|
||||
|
||||
extra->data_device[i] = buf;
|
||||
extra->data_device[id] = buf;
|
||||
|
||||
for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
|
||||
/*
|
||||
|
@ -5030,7 +4879,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|||
string was inserted. You need to rewrite this code.
|
||||
*/
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event()));
|
||||
CHECK_TRY_ERROR(extra->events[id][is] = new sycl::event()));
|
||||
}
|
||||
}
|
||||
tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
|
@ -5058,6 +4907,7 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
|
||||
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
int64_t row_low, row_high;
|
||||
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
|
||||
|
||||
|
@ -5081,11 +4931,11 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|||
error codes. The original code was commented out and a warning string
|
||||
was inserted. You need to rewrite this code.
|
||||
*/
|
||||
ggml_sycl_set_device(i);
|
||||
const queue_ptr stream = ctx->streams[i];
|
||||
ggml_sycl_set_device(id);
|
||||
const queue_ptr stream = ctx->streams[id];
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
(*stream)
|
||||
.memcpy(extra->data_device[i], buf_host, original_size)
|
||||
.memcpy(extra->data_device[id], buf_host, original_size)
|
||||
.wait()));
|
||||
}
|
||||
}
|
||||
|
@ -5111,6 +4961,7 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
|
||||
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
int64_t row_low, row_high;
|
||||
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
|
||||
|
||||
|
@ -5134,11 +4985,11 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|||
error codes. The original code was commented out and a warning string
|
||||
was inserted. You need to rewrite this code.
|
||||
*/
|
||||
ggml_sycl_set_device(i);
|
||||
const queue_ptr stream = ctx->streams[i];
|
||||
ggml_sycl_set_device(id);
|
||||
const queue_ptr stream = ctx->streams[id];
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
(*stream)
|
||||
.memcpy(buf_host, extra->data_device[i], original_size)
|
||||
.memcpy(buf_host, extra->data_device[id], original_size)
|
||||
.wait()));
|
||||
}
|
||||
}
|
||||
|
@ -5233,7 +5084,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const f
|
|||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n");
|
||||
ggml_check_sycl();
|
||||
|
||||
// FIXME: this is not thread safe
|
||||
static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
|
||||
|
||||
|
@ -5350,6 +5201,7 @@ GGML_CALL static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
|||
|
||||
GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
||||
const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
|
||||
(char *)tensor->data + offset, data, size).wait()));
|
||||
}
|
||||
|
@ -5613,24 +5465,19 @@ static ggml_guid_t ggml_backend_sycl_guid() {
|
|||
return &guid;
|
||||
}
|
||||
|
||||
GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
|
||||
ggml_check_sycl();
|
||||
|
||||
check_allow_gpu_index(device);
|
||||
|
||||
ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(device);
|
||||
GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device_id) {
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init, device_id=%d\n", device_id);
|
||||
check_allow_device_id(device_id);
|
||||
ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(ggml_sycl_info(), device_id);
|
||||
if (ctx == nullptr) {
|
||||
fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
|
||||
return nullptr;
|
||||
};
|
||||
|
||||
ggml_backend_t sycl_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_sycl_guid(),
|
||||
/* .interface = */ ggml_backend_sycl_interface,
|
||||
/* .context = */ ctx
|
||||
};
|
||||
|
||||
return sycl_backend;
|
||||
}
|
||||
|
||||
|
@ -5655,9 +5502,10 @@ extern "C" int ggml_backend_sycl_reg_devices();
|
|||
int ggml_backend_sycl_reg_devices() {
|
||||
assert(ggml_sycl_info().device_count>0);
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; i++) {
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
char name[128];
|
||||
snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, i);
|
||||
ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(i), (void *) (intptr_t) i);
|
||||
snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, id);
|
||||
ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(id), (void *) (intptr_t) id);
|
||||
}
|
||||
return ggml_sycl_info().device_count;
|
||||
}
|
||||
|
|
|
@@ -51,3 +51,451 @@ void ggml_sycl_host_free(void* ptr) try {
              << ", line:" << __LINE__ << std::endl;
    std::exit(1);
}

static inline int get_sycl_env(const char *env_name, int default_val) {
    char *user_device_string = getenv(env_name);
    int user_number = default_val;

    unsigned n;
    if (user_device_string != NULL &&
        sscanf(user_device_string, " %u", &n) == 1) {
        user_number = (int)n;
    } else {
        user_number = default_val;
    }
    return user_number;
}

static inline bool env_existed(const char *env_name) {
    char *user_device_string = getenv(env_name);
    return user_device_string != NULL;
}

// Parse GGML_SYCL_VISIBLE_DEVICES (e.g. "0,2") into a list of device IDs.
static std::vector<int> get_sycl_visible_devices() {
    static std::vector<int> device_ids;
    char *devices_env = getenv("GGML_SYCL_VISIBLE_DEVICES");
    if (devices_env != nullptr) {
        std::string devices(devices_env);
        std::replace(devices.begin(), devices.end(), ',', ' ');

        std::stringstream ss(devices);
        int tmp;
        while (ss >> tmp) {
            device_ids.push_back(tmp);
        }
    }
    return device_ids;
}

void print_device_detail_part1(int id, sycl::device &device, std::string device_type) {

    dpct::device_info prop;
    SYCL_CHECK(CHECK_TRY_ERROR(
        dpct::get_device_info(prop, device)));

    std::string version;
    version += std::to_string(prop.get_major_version());
    version += ".";
    version += std::to_string(prop.get_minor_version());

    device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
    std::string name = std::string(prop.get_name());
    name = std::regex_replace(name, std::regex("\\(R\\)"), "");
    name = std::regex_replace(name, std::regex("\\(TM\\)"), "");

    auto global_mem_size = prop.get_global_mem_size()/1000000;

    fprintf(stderr, "|%2d|%19s|%4s|%39s|%14luM|\n", id, device_type.c_str(), version.c_str(),
            name.c_str(), global_mem_size);
}

void print_device_detail_part2(int id, sycl::device &device, std::string device_type) {

    dpct::device_info prop;
    SYCL_CHECK(CHECK_TRY_ERROR(
        dpct::get_device_info(prop, device)));

    fprintf(stderr, "|%2d|%17d|%14d|%12d|%34s|\n", id,
            prop.get_max_compute_units(),
            prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
            device.get_info<sycl::info::device::driver_version>().c_str());
}

void ggml_backend_sycl_print_sycl_devices() {
    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
    int device_count = dpct::dev_mgr::instance().device_count();
    std::map<std::string, size_t> DeviceNums;
    fprintf(stderr, "found %d SYCL devices:\n", device_count);
    fprintf(stderr, "Part1:\n");
    fprintf(stderr, "|ID| Device Type| Ver| Name|Global mem size|\n");
    fprintf(stderr, "|--|-------------------|----|---------------------------------------|---------------|\n");
    for (int id = 0; id < device_count; ++id) {
        sycl::device device = dpct::dev_mgr::instance().get_device(id);
        sycl::backend backend = device.get_backend();
        std::string backend_type = get_device_backend_and_type(device);
        int type_id = DeviceNums[backend_type]++;
        std::stringstream device_type;
        device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]";
        print_device_detail_part1(id, device, device_type.str());
    }

    std::map<std::string, size_t> DeviceNums2;
    fprintf(stderr, "\nPart2:\n");
    fprintf(stderr, "|ID|Max compute units|Max work group|Max subgroup| Driver version|\n");
    fprintf(stderr, "|--|-----------------|--------------|------------|----------------------------------|\n");
    for (int id = 0; id < device_count; ++id) {
        sycl::device device = dpct::dev_mgr::instance().get_device(id);
        sycl::backend backend = device.get_backend();
        std::string backend_type = get_device_backend_and_type(device);
        int type_id = DeviceNums2[backend_type]++;
        std::stringstream device_type;
        device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]";
        print_device_detail_part2(id, device, device_type.str());
    }
}

static ggml_sycl_device_info ggml_sycl_init() try {
    static bool initialized = false;

    if (!initialized) {
        fprintf(stderr, "[SYCL] call ggml_sycl_init\n");

        g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
        fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__,
                g_ggml_sycl_debug);

#if defined(GGML_SYCL_F16)
        fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__);
#else
        fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
#endif

#if defined(GGML_SYCL_FORCE_MMQ)
        fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__);
#else
        fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__);
#endif

#if defined(SYCL_USE_XMX)
        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
#else
        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
#endif

        if (CHECK_TRY_ERROR(g_all_sycl_device_count =
                                dpct::dev_mgr::instance().device_count()) !=
            0) {
            initialized = true;
            // no usable SYCL device: return an empty device info
            return ggml_sycl_device_info{};
        }
        GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
        ggml_backend_sycl_print_sycl_devices();
        initialized = true;
    }

    static ggml_sycl_device_info info = {};
    info.refresh_device();

    if (info.device_count == 0) {
        fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": no available device found\n",
                __func__);
        return info;
    }
    GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES);

    return info;
} catch (sycl::exception const &exc) {
    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
              << ", line:" << __LINE__ << std::endl;
    std::exit(1);
}

ggml_sycl_device_info &ggml_sycl_info() {
    static ggml_sycl_device_info info = ggml_sycl_init();
    return info;
}

//--sycl_device_mgr--
|
||||
|
||||
sycl_device_mgr::sycl_device_mgr(
|
||||
ggml_sycl_backend_device_filter device_filter) {
|
||||
switch (device_filter) {
|
||||
case SYCL_DEVICES_TOP_LEVEL_ZERO:
|
||||
detect_sycl_gpu_list_with_max_cu();
|
||||
create_context_for_group_gpus();
|
||||
break;
|
||||
case SYCL_ALL_DEVICES:
|
||||
detect_all_sycl_device_list();
|
||||
create_context_for_devices();
|
||||
break;
|
||||
case SYCL_VISIBLE_DEVICES:
|
||||
detect_sycl_visible_device_list();
|
||||
create_context_for_devices();
|
||||
break;
|
||||
default:
|
||||
std::cerr << "sycl_device_mgr: Invalid device_filter " << device_filter
|
||||
<< std::endl;
|
||||
}
|
||||
init_allow_devices();
|
||||
}
|
||||
|
||||
/*
|
||||
Bind all gpus in same host with same context, for better performance in
|
||||
device-to-device copy in the future.
|
||||
*/
|
||||
void sycl_device_mgr::create_context_for_group_gpus() {
sycl::context ctx = sycl::context(devices);
assert(device_ids.size() > 0);
first_queue = _create_queue_ptr(devices[0]);
sycl::context ctx0 = first_queue->get_context();
for (int i = 0; i < device_ids.size(); i++) {
ctxs.push_back(ctx0);
}
}

sycl::queue *sycl_device_mgr::_create_queue_ptr(sycl::device device) {
auto q = dpct::get_current_device().create_queue(device);
return q;
// _queues.push_back(q);
// return & _queues.back();
}

sycl::queue *sycl_device_mgr::create_queue_for_device(sycl::device &device) {
dpct::select_device(dpct::dev_mgr::instance().get_device_id(device));
auto qptr = _create_queue_ptr(device);
return qptr;
}

sycl::queue *sycl_device_mgr::create_queue_for_device_id(int device_id) {
int i = get_device_index(device_id);
sycl::device device = dpct::dev_mgr::instance().get_device(device_id);
return create_queue_for_device(device);
}

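// Reverse lookup: returns the position of a dpct device id inside device_ids,
// or -1 when the id is not managed by this instance.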
int sycl_device_mgr::get_device_index(int device_id) {
for (int i = 0; i < device_ids.size(); i++) {
if (device_ids[i] == device_id)
return i;
}
return -1;
}

void sycl_device_mgr::create_context_for_devices() {
for (int i = 0; i < device_ids.size(); i++) {
sycl::context ctx = sycl::context(devices[i]);
ctxs.push_back(ctx);
}
}

void sycl_device_mgr::init_allow_devices() {
device_list = "";
for (size_t i = 0; i < device_ids.size(); ++i) {
device_list += std::to_string(device_ids[i]);
device_list += ",";
}
if (device_list.length() > 1) {
device_list.pop_back();
}
}

bool sycl_device_mgr::is_allowed_device(int device_id) {
return std::find(device_ids.begin(), device_ids.end(), device_id) !=
device_ids.end();
}

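// Records every device known to dpct, together with its max work-group size
// and max compute units, without any filtering.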
void sycl_device_mgr::detect_all_sycl_device_list() try {
int device_count = dpct::dev_mgr::instance().device_count();

for (int id = 0; id < device_count; id++) {
sycl::device device = dpct::dev_mgr::instance().get_device(id);
device_ids.push_back(id);
devices.push_back(device);
dpct::device_info prop;
dpct::get_device_info(prop, device);
work_group_sizes.push_back(prop.get_max_work_group_size());
max_compute_units.push_back(prop.get_max_compute_units());
}
return;
} catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
<< ", line:" << __LINE__ << std::endl;
std::exit(1);
}

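// Keeps only the device ids listed in GGML_SYCL_VISIBLE_DEVICES; ids outside
// the valid range are reported on stderr.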
void sycl_device_mgr::detect_sycl_visible_device_list() try {
std::vector<int> sycl_devices = get_sycl_visible_devices();
int device_count = dpct::dev_mgr::instance().device_count();

for (int i = 0; i < sycl_devices.size(); i++) {
int id = sycl_devices[i];
if (id >= device_count) {
std::cerr << __func__ << ": invalid device_id:" << id
<< " from GGML_SYCL_VISIBLE_DEVICES="
<< getenv("GGML_SYCL_VISIBLE_DEVICES")
<< ", available IDs: ";
if (device_count > 1) {
std::cerr << "[0, " << device_count - 1 << "]";
} else if (device_count == 1) {
std::cerr << "[0]";
} else {
std::cerr << "[]";
}
std::cerr << std::endl;
continue; // skip out-of-range ids instead of querying them
}
sycl::device device = dpct::dev_mgr::instance().get_device(id);
device_ids.push_back(id);
devices.push_back(device);
dpct::device_info prop;
dpct::get_device_info(prop, device);
work_group_sizes.push_back(prop.get_max_work_group_size());
max_compute_units.push_back(prop.get_max_compute_units());
}
return;
} catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
<< ", line:" << __LINE__ << std::endl;
std::exit(1);
}

/*
Use all GPUs that share the same top max compute units value.
*/
void sycl_device_mgr::detect_sycl_gpu_list_with_max_cu() try {
int device_count = dpct::dev_mgr::instance().device_count();
int local_max_compute_units = 0;
for (int id = 0; id < device_count; id++) {
sycl::device device = dpct::dev_mgr::instance().get_device(id);
if (!device.is_gpu())
continue;
dpct::device_info prop;
dpct::get_device_info(prop, device);
if (local_max_compute_units < prop.get_max_compute_units())
local_max_compute_units = prop.get_max_compute_units();
}

for (int id = 0; id < device_count; id++) {
sycl::device device = dpct::dev_mgr::instance().get_device(id);
if (!device.is_gpu())
continue;
dpct::device_info prop;
dpct::get_device_info(prop, device);
if (local_max_compute_units == prop.get_max_compute_units() &&
is_ext_oneapi_device(device)) {
device_ids.push_back(id);
devices.push_back(device);
work_group_sizes.push_back(prop.get_max_work_group_size());
max_compute_units.push_back(prop.get_max_compute_units());
}
}
return;
} catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
<< ", line:" << __LINE__ << std::endl;
std::exit(1);
}

int sycl_device_mgr::get_device_count() { return (int)device_ids.size(); }

bool sycl_device_mgr::is_ext_oneapi_device(const sycl::device &dev) {
sycl::backend dev_backend = dev.get_backend();
if (dev_backend == sycl::backend::ext_oneapi_level_zero ||
dev_backend == sycl::backend::ext_oneapi_cuda ||
dev_backend == sycl::backend::ext_oneapi_hip)
return true;
return false;
}
//--sycl_device_mgr--

//--ggml_sycl_device_info--
void ggml_sycl_device_info::print_gpu_device_list() {
GGML_ASSERT(device_mgr);

const char *hint = NULL;
if (oneapi_device_selector_existed && sycl_visible_devices_existed) {
hint = "detect %d SYCL devices:[%s] by ONEAPI_DEVICE_SELECTOR=%s and "
"GGML_SYCL_VISIBLE_DEVICES=%s\n";
fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(),
getenv("ONEAPI_DEVICE_SELECTOR"),
getenv("GGML_SYCL_VISIBLE_DEVICES"));
} else if (oneapi_device_selector_existed) {
hint = "detect %d SYCL devices:[%s] by ONEAPI_DEVICE_SELECTOR=%s\n";
fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(),
getenv("ONEAPI_DEVICE_SELECTOR"));
} else if (sycl_visible_devices_existed) {
hint = "detect %d SYCL devices:[%s] by GGML_SYCL_VISIBLE_DEVICES=%s\n";
fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(),
getenv("GGML_SYCL_VISIBLE_DEVICES"));
} else {
hint = "detect %d SYCL level-zero GPUs:[%s] with top Max compute "
"units:%d, to use any SYCL devices, set/export "
"GGML_SYCL_VISIBLE_DEVICES or ONEAPI_DEVICE_SELECTOR\n";
fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(),
device_mgr->max_compute_units[0]);
}
}

int ggml_sycl_device_info::work_group_size(int device_id) {
GGML_ASSERT(device_mgr);
return device_mgr->work_group_sizes[device_id];
}

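// Rebuilds the device manager according to the environment (visible-devices
// list, ONEAPI_DEVICE_SELECTOR, or the default level-zero GPU selection) and
// recomputes the default tensor split from each device's global memory size.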
void ggml_sycl_device_info::refresh_device() {
oneapi_device_selector_existed = env_existed("ONEAPI_DEVICE_SELECTOR");
sycl_visible_devices_existed = env_existed("GGML_SYCL_VISIBLE_DEVICES");
if (device_mgr) // release the previous manager before re-detecting devices
delete device_mgr;

if (sycl_visible_devices_existed) {
device_mgr = new sycl_device_mgr(SYCL_VISIBLE_DEVICES);
} else if (oneapi_device_selector_existed) {
device_mgr = new sycl_device_mgr(SYCL_ALL_DEVICES);
} else {
device_mgr = new sycl_device_mgr(SYCL_DEVICES_TOP_LEVEL_ZERO);
}

device_count = device_mgr->get_device_count();

int64_t total_vram = 0;

for (int i = 0; i < device_count; ++i) {
int id = get_device_id(i);
devices[id].vmm = 0;
dpct::device_info prop;
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
prop, dpct::dev_mgr::instance().get_device(id))));

default_tensor_split[i] =
total_vram; // contiguous data, so index by device index
total_vram += prop.get_global_mem_size();

devices[id].cc =
100 * prop.get_major_version() + 10 * prop.get_minor_version();
}

for (int i = 0; i < device_count; ++i) {
default_tensor_split[i] /=
total_vram; // normalize the prefix to a fraction of total VRAM
}

print_gpu_device_list();
}

bool ggml_sycl_device_info::is_allowed_device(int device_id) {
return device_mgr->is_allowed_device(device_id);
}

const char *ggml_sycl_device_info::devices_list() {
return device_mgr->device_list.c_str();
}

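// Maps a 0-based position in the allowed-device list to the underlying dpct
// device id; exits with an error if the index is out of range.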
int ggml_sycl_device_info::get_device_id(int device_index) {
if (device_index < device_mgr->device_ids.size()) {
return device_mgr->device_ids.at(device_index);
} else {
std::cerr << __func__ << ":SYCL device:" << device_index
<< " is out of range:[" << devices_list() << "]" << std::endl;
std::exit(1);
}
}

//--ggml_sycl_device_info--
@@ -15,6 +15,7 @@
#include <fstream>
#include <iostream>
#include <regex>

#include "dpct/helper.hpp"
#include "ggml-sycl.h"
@@ -47,11 +48,6 @@ static int g_ggml_sycl_debug = 0;
} \
}()

// #define DEBUG_SYCL_MALLOC

static int g_work_group_size = 0;
// typedef sycl::half ggml_fp16_t;

#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
#define VER_4VEC 610 // todo for hardware optimization.
#define VER_GEN9 700 // todo for hardware optimization.
@@ -89,6 +85,12 @@ enum ggml_sycl_backend_gpu_mode {
SYCL_MUL_GPU_MODE
};

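// Device selection strategies for sycl_device_mgr: every SYCL device, only the
// GPUs sharing the top max-compute-units value, or the ids named in
// GGML_SYCL_VISIBLE_DEVICES.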
enum ggml_sycl_backend_device_filter {
SYCL_ALL_DEVICES = 0,
SYCL_DEVICES_TOP_LEVEL_ZERO,
SYCL_VISIBLE_DEVICES
};

static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");

static void crash() {
@@ -107,6 +109,8 @@ static void crash() {
GGML_ASSERT(!"SYCL error");
}

#define SYCL_RETURN_ERROR 1

#define SYCL_CHECK(err) \
do { \
auto err_ = (err); \
@@ -119,6 +123,7 @@ static void crash() {
"Meet error in this line code!"); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#if DPCT_COMPAT_RT_VERSION >= 11100
|
||||
#define GGML_SYCL_ASSUME(x) __builtin_assume(x)
|
||||
#else
|
||||
|
@@ -147,6 +152,8 @@ static void* g_scratch_buffer = nullptr;
static size_t g_scratch_size = 0; // disabled by default
static size_t g_scratch_offset = 0;

int get_current_device_id();

[[noreturn]] static inline void bad_arch(const sycl::stream& stream_ct1) {
stream_ct1 << "ERROR: ggml-sycl was compiled without support for the "
"current GPU architecture.\n";
@@ -156,20 +163,18 @@ static size_t g_scratch_offset = 0;
(void)bad_arch; // suppress unused function warning
}

int get_current_device_id();

inline dpct::err0 ggml_sycl_set_device(const int device) try {
inline dpct::err0 ggml_sycl_set_device(const int device_id) try {

int current_device_id;
SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));

// GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d,
// current_device_id=%d\n", device, current_device);
if (device == current_device_id) {
GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, current_device_id=%d\n", device_id, current_device_id);
if (device_id == current_device_id) {
return 0;
}

return CHECK_TRY_ERROR(dpct::select_device(device));
return CHECK_TRY_ERROR(dpct::select_device(device_id));

} catch (sycl::exception const& exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
<< ", line:" << __LINE__ << std::endl;
@@ -177,10 +182,39 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
std::exit(1);
}

//////////////////////
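// Owns the detected SYCL devices for the backend: their ids, per-device
// properties (max compute units, work-group sizes), contexts and queues, plus
// the printable allowed-device list.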
class sycl_device_mgr {
public:
std::vector<int> device_ids;
std::vector<sycl::device> devices;
std::vector<int> max_compute_units;
std::vector<int> work_group_sizes;
sycl::queue *first_queue;
std::vector<sycl::queue> _queues;
std::vector<sycl::context> ctxs;
std::string device_list = "";

sycl_device_mgr(ggml_sycl_backend_device_filter device_filter);

sycl::queue *_create_queue_ptr(sycl::device device); //internal API to hide dpct API.
void create_context_for_group_gpus();
sycl::queue *create_queue_for_device(sycl::device &device);
sycl::queue *create_queue_for_device_id(int device_id);
int get_device_index(int device_id);
void create_context_for_devices();
void init_allow_devices();
bool is_allowed_device(int device_id);
void detect_all_sycl_device_list();
void detect_sycl_visible_device_list();
void detect_sycl_gpu_list_with_max_cu();
int get_device_count();
bool is_ext_oneapi_device(const sycl::device &dev);
};

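// Illustrative usage only (not part of this patch): the manager is normally
// created through ggml_sycl_device_info::refresh_device(), but it can also be
// driven directly, e.g.:
//   sycl_device_mgr mgr(SYCL_ALL_DEVICES);
//   sycl::queue *q = mgr.create_queue_for_device_id(mgr.device_ids[0]);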

struct ggml_sycl_device_info {
int device_count;
bool oneapi_device_selector_existed = false;
bool sycl_visible_devices_existed = false;

struct sycl_device_info {
int cc; // compute capability
@@ -193,9 +227,16 @@ struct ggml_sycl_device_info {
sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};

std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
};

const ggml_sycl_device_info & ggml_sycl_info();
sycl_device_mgr *device_mgr = NULL;

void print_gpu_device_list();
int work_group_size(int device_id);
void refresh_device();
bool is_allowed_device(int device_id);
const char* devices_list();
int get_device_id(int device_index);
};

struct ggml_sycl_pool {
virtual ~ggml_sycl_pool() = default;
@@ -262,15 +303,16 @@ struct ggml_backend_sycl_context {

queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };

explicit ggml_backend_sycl_context(int device) :
device(device),
explicit ggml_backend_sycl_context(struct ggml_sycl_device_info &sycl_device_info, int device_id) :
device(device_id),
name(GGML_SYCL_NAME + std::to_string(device)) {
for (int i=0;i<GGML_SYCL_MAX_STREAMS; i++){
qptrs[device_id][i] = sycl_device_info.device_mgr->create_queue_for_device_id(device_id);
}
}

queue_ptr stream(int device, int stream) {
if (qptrs[device][stream] == nullptr) {
qptrs[device][stream] = &(dpct::get_current_device().default_queue());
}
assert(qptrs[device][stream] != nullptr);
return qptrs[device][stream];
}

@@ -295,6 +337,20 @@ struct ggml_backend_sycl_context {
}
};

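// Routes a fatal condition through SYCL_CHECK so the failing location is
// reported before the process aborts.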
static inline void exit_with_stack_print() {
SYCL_CHECK(SYCL_RETURN_ERROR);
}


static inline int get_sycl_env(const char *env_name, int default_val);
static inline bool env_existed(const char *env_name);
void* ggml_sycl_host_malloc(size_t size);
void ggml_sycl_host_free(void* ptr);
static std::vector<int> get_sycl_visible_devices();
void ggml_backend_sycl_print_sycl_devices();
static ggml_sycl_device_info ggml_sycl_init();
ggml_sycl_device_info &ggml_sycl_info();

// common host functions

static inline int get_work_group_size(const sycl::device& device) {
@@ -588,7 +588,7 @@ namespace dpct
out = prop;
}

/// dpct device extension
/// dpct device extension
class device_ext : public sycl::device {
typedef std::mutex mutex_type;

@@ -687,119 +687,131 @@
init_queues();
}

sycl::queue &in_order_queue() { return _q_in_order; }
sycl::queue &in_order_queue() { return *_q_in_order; }

sycl::queue &out_of_order_queue() { return _q_out_of_order; }
sycl::queue &out_of_order_queue() { return *_q_out_of_order; }

sycl::queue &default_queue() { return in_order_queue(); }

void queues_wait_and_throw() {
std::unique_lock<mutex_type> lock(m_mutex);
std::vector<std::shared_ptr<sycl::queue>> current_queues(
_queues);
lock.unlock();
for (auto &q : _queues) {
q.wait_and_throw();
for (const auto &q : current_queues)
{
q->wait_and_throw();
}
// Guard the destruct of current_queues to make sure the ref count is
// safe.
lock.lock();
}

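// These factory methods hand out sycl::queue pointers; ownership stays with
// device_ext, which keeps every queue alive in _queues as a
// std::shared_ptr<sycl::queue>.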
sycl::queue create_queue(bool enable_exception_handler = false) {
sycl::queue *create_queue(bool enable_exception_handler = false) {
return create_in_order_queue(enable_exception_handler);
}

sycl::queue create_queue(sycl::device device,
sycl::queue *create_queue(sycl::device device,
bool enable_exception_handler = false) {
return create_in_order_queue(device, enable_exception_handler);
}

sycl::queue create_in_order_queue(bool enable_exception_handler = false) {
sycl::queue *create_in_order_queue(bool enable_exception_handler = false) {
std::lock_guard<mutex_type> lock(m_mutex);
return create_queue_impl(enable_exception_handler,
sycl::property::queue::in_order());
}

sycl::queue create_in_order_queue(sycl::device device,
sycl::queue *create_in_order_queue(sycl::device device,
bool enable_exception_handler = false) {
std::lock_guard<mutex_type> lock(m_mutex);
return create_queue_impl(device, enable_exception_handler,
sycl::property::queue::in_order());
}

sycl::queue create_out_of_order_queue(
sycl::queue *create_out_of_order_queue(
bool enable_exception_handler = false) {
std::lock_guard<mutex_type> lock(m_mutex);
return create_queue_impl(enable_exception_handler);
}

void destroy_queue(sycl::queue queue) {
void destroy_queue(sycl::queue *&queue) {
std::lock_guard<mutex_type> lock(m_mutex);
_queues.clear();
_queues.erase(std::remove_if(_queues.begin(), _queues.end(),
[=](const std::shared_ptr<sycl::queue> &q) -> bool
{
return q.get() == queue;
}),
_queues.end());
queue = nullptr;
}
void set_saved_queue(sycl::queue q) {
void set_saved_queue(sycl::queue *q) {
std::lock_guard<mutex_type> lock(m_mutex);
_saved_queue = q;
}
sycl::queue get_saved_queue() const {
sycl::queue *get_saved_queue() const {
std::lock_guard<mutex_type> lock(m_mutex);
return _saved_queue;
}

private:
void clear_queues() { _queues.clear(); }
void clear_queues() {
_queues.clear();
_q_in_order = _q_out_of_order = _saved_queue = nullptr;
}

void init_queues() {
_q_in_order =
create_queue_impl(true, sycl::property::queue::in_order());
_q_out_of_order = create_queue_impl(true);
_saved_queue = default_queue();
_saved_queue = &default_queue();
}

/// Caller should acquire resource \p m_mutex before calling this
/// function.
template <class... Properties>
sycl::queue create_queue_impl(bool enable_exception_handler,
sycl::queue *create_queue_impl(bool enable_exception_handler,
Properties... properties) {
sycl::async_handler eh = {};
if (enable_exception_handler) {
eh = exception_handler;
}
auto q = sycl::queue(*this, eh,
sycl::property_list(
_queues.push_back(std::make_shared<sycl::queue>(
*this, eh,
sycl::property_list(
#ifdef DPCT_PROFILING_ENABLED
sycl::property::queue::enable_profiling(),
sycl::property::queue::enable_profiling(),
#endif
properties...));
_queues.push_back(q);
properties...)));

return _queues.back();
return _queues.back().get();
}

template <class... Properties>
sycl::queue create_queue_impl(sycl::device device,
sycl::queue *create_queue_impl(sycl::device device,
bool enable_exception_handler,
Properties... properties) {
sycl::async_handler eh = {};
if (enable_exception_handler) {
eh = exception_handler;
}
_queues.push_back(
sycl::queue(device, eh,
_queues.push_back(std::make_shared<sycl::queue>(
device, eh,
sycl::property_list(
#ifdef DPCT_PROFILING_ENABLED
sycl::property::queue::enable_profiling(),
#endif
properties...)));

return _queues.back();
return _queues.back().get();
}

void get_version(int &major, int &minor) const {
detail::get_version(*this, major, minor);
}
sycl::queue _q_in_order, _q_out_of_order;
sycl::queue _saved_queue;
std::vector<sycl::queue> _queues;
sycl::queue *_q_in_order, *_q_out_of_order;
sycl::queue *_saved_queue;
std::vector<std::shared_ptr<sycl::queue>> _queues;
mutable mutex_type m_mutex;
};

@@ -855,15 +867,15 @@ namespace dpct
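// Returns the index of `dev` within _devs, or -1 when the device is not found.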
unsigned int get_device_id(const sycl::device &dev)
{
unsigned int id = 0;
for (auto dev_item : _devs)
for (auto &dev_item : _devs)
{
if (*dev_item == dev)
{
break;
return id;
}
id++;
}
return id;
return -1;
}

template <class DeviceSelector>
@@ -2704,7 +2704,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
#elif defined(GGML_USE_VULKAN)
buft = ggml_backend_vk_buffer_type(gpu);
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_buffer_type(gpu);
int gpu_id = ggml_backend_sycl_get_device_id(gpu);
buft = ggml_backend_sycl_buffer_type(gpu_id);
#elif defined(GGML_USE_KOMPUTE)
buft = ggml_backend_kompute_buffer_type(gpu);
if (buft == nullptr) {
@@ -17629,11 +17630,10 @@ struct llama_context * llama_new_context_with_model(
} else {
// LLAMA_SPLIT_LAYER requires a backend for each GPU
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
ggml_backend_t backend = ggml_backend_sycl_init(i);
int id = ggml_backend_sycl_get_device_id(i);
ggml_backend_t backend = ggml_backend_sycl_init(id);
if (backend == nullptr) {
int id_list[GGML_SYCL_MAX_DEVICES];
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, id, i);
|
||||
llama_free(ctx);
return nullptr;
}