Support multiple GPUs (split mode) on SYCL backend (#5806)

* support multiple cards: split-mode - layer|row * rm warning * rebase with master, support two new OPs, close feature for -sm=row, fix for unit test * update news * fix merge error * update according to review comments
2024-03-02 19:49:30 +08:00 · 2024-03-02 19:49:30 +08:00 · 715641391d
commit 715641391d
parent 9bf297a02b
8 changed files with 1506 additions and 814 deletions
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -123,20 +123,15 @@ static std::string get_gpu_info() {
    }
 #endif
 #ifdef GGML_USE_SYCL
-    int device_list[GGML_SYCL_MAX_DEVICES];
-    ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
-
-    for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
-        if (device_list[i] >0 ){
-            char buf[128];
-            ggml_sycl_get_device_description(i, buf, sizeof(buf));
-            id += buf;
+    int count = ggml_backend_sycl_get_device_count();
+    for (int i = 0; i < count; i++) {
+        char buf[128];
+        ggml_sycl_get_device_description(i, buf, sizeof(buf));
+        id += buf;
+        if (i < count - 1) {
            id += "/";
        }
    }
-    if (id.length() >2 ) {
-        id.pop_back();
-    }
 #endif
    // TODO: other backends
    return id;
--- a/examples/sycl/ls-sycl-device.cpp
+++ b/examples/sycl/ls-sycl-device.cpp
@ -7,7 +7,7 @@

 #include "ggml-sycl.h"

-int main(int argc, char ** argv) {
+int main() {
    ggml_backend_sycl_print_sycl_devices();
    return 0;
 }
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@ -8,12 +8,19 @@ INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 source /opt/intel/oneapi/setvars.sh

 if [ $# -gt 0 ]; then
-    export GGML_SYCL_DEVICE=$1
+    GGML_SYCL_DEVICE=$1
 else
-    export GGML_SYCL_DEVICE=0
+    GGML_SYCL_DEVICE=0
 fi
-echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
+echo "use $GGML_SYCL_DEVICE as main GPU"
 #export GGML_SYCL_DEBUG=1
-./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
-#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
+
+
+#ZES_ENABLE_SYSMAN=1 enables querying the GPU's free memory via sycl::aspect::ext_intel_free_memory. Recommended when --split-mode=layer.
+
+#use all GPUs with same max compute units
+ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+
+#use main GPU only
+#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none