support multiple cards: split-mode - layer|row

Jianyu Zhang 2024-02-28 19:34:10 +08:00
parent cb49e0f8c9
commit f87da8ebf3
6 changed files with 1283 additions and 832 deletions


@@ -1,6 +1,7 @@
 # llama.cpp for SYCL
 - [Background](#background)
+- [News](#news)
 - [OS](#os)
 - [Intel GPU](#intel-gpu)
 - [Docker](#docker)
@@ -25,6 +26,16 @@ The llama.cpp for SYCL is used to support Intel GPUs.
 For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
 
+## News
+
+- 2024.3
+  - Support multiple cards with **--split-mode**: none|layer|row
+  - Support assigning the main GPU with **--main-gpu**
+
+- 2024.1
+  - Create SYCL backend for Intel GPU.
+  - Support Windows build.
+
 ## OS
 
 |OS|Status|Verified|
@@ -449,6 +460,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 |-|-|-|
 |GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
 |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
+|ZES_ENABLE_SYSMAN|0 (default) or 1|Enable getting the free memory of the GPU via sycl::aspect::ext_intel_free_memory.<br>Recommended when --split-mode = layer|
 
 ## Known Issue
@@ -458,6 +470,10 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
   Solution: add **--no-mmap** or **--mmap 0**.
 
+- Running with **--split-mode** = row is slow.
+
+  It is limited by a missing feature (device-to-device memory copy); a temporary workaround is in place and it will be fixed in the future.
+
 ## Q&A
 
 - Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.


@@ -125,18 +125,15 @@ static std::string get_gpu_info() {
 #ifdef GGML_USE_SYCL
     int device_list[GGML_SYCL_MAX_DEVICES];
     ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
-
-    for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
-        if (device_list[i] >0 ){
-            char buf[128];
-            ggml_sycl_get_device_description(i, buf, sizeof(buf));
-            id += buf;
-            id += "/";
-        }
-    }
-    if (id.length() >2 ) {
-        id.pop_back();
-    }
+    int count = ggml_backend_sycl_get_device_count();
+    for (int i = 0; i < count; i++) {
+        char buf[128];
+        ggml_sycl_get_device_description(i, buf, sizeof(buf));
+        id += buf;
+        if (i < count - 1) {
+            id += "/";
+        }
+    }
 #endif
     // TODO: other backends
     return id;


@@ -8,12 +8,19 @@ INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 source /opt/intel/oneapi/setvars.sh
 
 if [ $# -gt 0 ]; then
-    export GGML_SYCL_DEVICE=$1
+    GGML_SYCL_DEVICE=$1
 else
-    export GGML_SYCL_DEVICE=0
+    GGML_SYCL_DEVICE=0
 fi
 
-echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
+echo "use $GGML_SYCL_DEVICE as main GPU"
 #export GGML_SYCL_DEBUG=1
-./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
-#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
+
+#ZES_ENABLE_SYSMAN=1: get the free memory of the GPU via sycl::aspect::ext_intel_free_memory; recommended when --split-mode = layer
+
+#use all GPUs with same max compute units
+ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE
+
+#use main GPU only
+#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none

File diff suppressed because it is too large.


@@ -24,6 +24,11 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 GGML_API void ggml_backend_sycl_print_sycl_devices(void);
 GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
 GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
+GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
 
 #ifdef __cplusplus
 }
 #endif
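The new declarations are enough to enumerate the SYCL devices from application code. Below is a minimal sketch, not part of this commit, that combines them to print each device's description and memory; it assumes GGML_SYCL_MAX_DEVICES is also defined by ggml-sycl.h (the callers in this commit use it) and follows the parameter order of the declaration above for the memory query.

// Hedged sketch (not part of this commit): enumerate the SYCL GPUs exposed by
// the backend and print each description plus free/total memory.
// Assumes ggml-sycl.h provides GGML_SYCL_MAX_DEVICES and the declarations above.
#include <cstdio>
#include "ggml-sycl.h"

int main() {
    // IDs of the usable GPUs, as reported by the SYCL backend.
    int id_list[GGML_SYCL_MAX_DEVICES];
    ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);

    // Number of devices the backend will actually use.
    int count = ggml_backend_sycl_get_device_count();
    for (int i = 0; i < count; i++) {
        char desc[128];
        ggml_sycl_get_device_description(i, desc, sizeof(desc));

        // Argument order follows the declaration above: free, then total.
        size_t free = 0, total = 0;
        ggml_backend_sycl_get_device_memory(i, &free, &total);

        printf("device index %d (id %d): %s, free %zu / total %zu bytes\n",
               i, id_list[i], desc, free, total);
    }
    return 0;
}

The remaining declaration, ggml_backend_sycl_get_device_index, maps a user-visible device id to the backend's internal index; the llama.cpp hunk below uses it to translate model->main_gpu before calling ggml_backend_sycl_init.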


@@ -102,6 +102,10 @@
 #define LLAMA_MAX_NODES 8192
 #define LLAMA_MAX_EXPERTS 8
 
+#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
+#define GGML_USE_CUBLAS_SYCL
+#endif
+
 //
 // logging
 //
@@ -1461,6 +1465,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu
     }
 #endif
 
+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
     if (buft == nullptr) {
         buft = llama_default_buffer_type_offload(fallback_gpu);
     }
@@ -1472,6 +1482,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu
 static size_t llama_get_device_count() {
 #if defined(GGML_USE_CUBLAS)
     return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     return ggml_backend_vk_get_device_count();
 #else
@@ -1485,6 +1497,11 @@ static size_t llama_get_device_memory(int device) {
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &total, &free);
     return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &total, &free);
+    return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
@@ -11974,13 +11991,31 @@ struct llama_context * llama_new_context_with_model(
     }
 #elif defined(GGML_USE_SYCL)
     if (model->n_gpu_layers > 0) {
-        ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
-        if (backend == nullptr) {
-            LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
-            llama_free(ctx);
-            return nullptr;
+        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, model->main_gpu, main_gpu_index);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        } else {
+            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+            int id_list[GGML_SYCL_MAX_DEVICES];
+            ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+            for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
+                int device_id = id_list[i];
+                ggml_backend_t backend = ggml_backend_sycl_init(i);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, device_id, i);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
         }
-        ctx->backends.push_back(backend);
     }
 #elif defined(GGML_USE_KOMPUTE)
     if (model->n_gpu_layers > 0) {