support multiple cards: split-mode - layer|row

Jianyu Zhang 2024-02-28 19:34:10 +08:00
parent cb49e0f8c9
commit f87da8ebf3
6 changed files with 1283 additions and 832 deletions


@@ -1,6 +1,7 @@
 # llama.cpp for SYCL
 - [Background](#background)
+- [News](#news)
 - [OS](#os)
 - [Intel GPU](#intel-gpu)
 - [Docker](#docker)
@@ -25,6 +26,16 @@ The llama.cpp for SYCL is used to support Intel GPUs.
 For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
 
+## News
+
+- 2024.3
+  - Support multiple cards with **--split-mode**: none|layer|row
+  - Support assigning the main GPU with **--main-gpu**
+
+- 2024.1
+  - Create SYCL backend for Intel GPU.
+  - Support Windows build.
+
 ## OS
 
 |OS|Status|Verified|
@@ -449,6 +460,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 |-|-|-|
 |GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
 |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
+|ZES_ENABLE_SYSMAN|0 (default) or 1|Enable getting the free memory of the GPU via sycl::aspect::ext_intel_free_memory.<br>Recommended when --split-mode = layer|
 
 ## Known Issue
@@ -458,6 +470,10 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
   Solution: add **--no-mmap** or **--mmap 0**.
 
+- Running with **--split-mode** = row is slow.
+
+  It is limited by a missing feature (device-to-device memory copy); a temporary workaround is in place and it will be fixed in the future.
+
 ## Q&A
 
 - Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.


@@ -125,18 +125,15 @@ static std::string get_gpu_info() {
 #ifdef GGML_USE_SYCL
     int device_list[GGML_SYCL_MAX_DEVICES];
     ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
-
-    for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
-        if (device_list[i] >0 ){
-            char buf[128];
-            ggml_sycl_get_device_description(i, buf, sizeof(buf));
-            id += buf;
-            id += "/";
-        }
-    }
-    if (id.length() >2 ) {
-        id.pop_back();
-    }
+    int count = ggml_backend_sycl_get_device_count();
+    for (int i = 0; i < count; i++) {
+        char buf[128];
+        ggml_sycl_get_device_description(i, buf, sizeof(buf));
+        id += buf;
+        if (i < count - 1) {
+            id += "/";
+        }
+    }
 #endif
     // TODO: other backends
     return id;


@@ -8,12 +8,19 @@ INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 source /opt/intel/oneapi/setvars.sh
 
 if [ $# -gt 0 ]; then
-    export GGML_SYCL_DEVICE=$1
+    GGML_SYCL_DEVICE=$1
 else
-    export GGML_SYCL_DEVICE=0
+    GGML_SYCL_DEVICE=0
 fi
 
-echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
+echo "use $GGML_SYCL_DEVICE as main GPU"
 #export GGML_SYCL_DEBUG=1
-./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
-#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
+
+#ZES_ENABLE_SYSMAN=1: get the free memory of the GPU via sycl::aspect::ext_intel_free_memory; recommended when --split-mode = layer
+
+#use all GPUs with same max compute units
+ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE
+
+#use main GPU only
+#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none

File diff suppressed because it is too large.


@@ -24,6 +24,11 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 GGML_API void ggml_backend_sycl_print_sycl_devices(void);
 GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
 GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
+GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
 
 #ifdef __cplusplus
 }
 #endif
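The new declarations are enough to enumerate the SYCL devices from application code. Below is a minimal sketch, not part of this commit, that combines them to print each device's description and memory; it assumes GGML_SYCL_MAX_DEVICES is also defined by ggml-sycl.h (the callers in this commit use it) and follows the parameter order of the declaration above for the memory query.

// Hedged sketch (not part of this commit): enumerate the SYCL GPUs exposed by
// the backend and print each description plus free/total memory.
// Assumes ggml-sycl.h provides GGML_SYCL_MAX_DEVICES and the declarations above.
#include <cstdio>
#include "ggml-sycl.h"

int main() {
    // IDs of the usable GPUs, as reported by the SYCL backend.
    int id_list[GGML_SYCL_MAX_DEVICES];
    ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);

    // Number of devices the backend will actually use.
    int count = ggml_backend_sycl_get_device_count();
    for (int i = 0; i < count; i++) {
        char desc[128];
        ggml_sycl_get_device_description(i, desc, sizeof(desc));

        // Argument order follows the declaration above: free, then total.
        size_t free = 0, total = 0;
        ggml_backend_sycl_get_device_memory(i, &free, &total);

        printf("device index %d (id %d): %s, free %zu / total %zu bytes\n",
               i, id_list[i], desc, free, total);
    }
    return 0;
}

The remaining declaration, ggml_backend_sycl_get_device_index, maps a user-visible device id to the backend's internal index; the llama.cpp hunk below uses it to translate model->main_gpu before calling ggml_backend_sycl_init.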


@@ -102,6 +102,10 @@
 #define LLAMA_MAX_NODES 8192
 #define LLAMA_MAX_EXPERTS 8
 
+#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
+#define GGML_USE_CUBLAS_SYCL
+#endif
+
 //
 // logging
 //
@@ -1461,6 +1465,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu
     }
 #endif
 
+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
     if (buft == nullptr) {
         buft = llama_default_buffer_type_offload(fallback_gpu);
     }
@@ -1472,6 +1482,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu
 static size_t llama_get_device_count() {
 #if defined(GGML_USE_CUBLAS)
     return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     return ggml_backend_vk_get_device_count();
 #else
@@ -1485,6 +1497,11 @@ static size_t llama_get_device_memory(int device) {
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &total, &free);
     return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &total, &free);
+    return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
@@ -11974,13 +11991,31 @@ struct llama_context * llama_new_context_with_model(
     }
 #elif defined(GGML_USE_SYCL)
     if (model->n_gpu_layers > 0) {
-        ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
-        if (backend == nullptr) {
-            LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
-            llama_free(ctx);
-            return nullptr;
+        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, model->main_gpu, main_gpu_index);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        } else {
+            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+            int id_list[GGML_SYCL_MAX_DEVICES];
+            ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+            for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
+                int device_id = id_list[i];
+                ggml_backend_t backend = ggml_backend_sycl_init(i);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, device_id, i);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
         }
-        ctx->backends.push_back(backend);
     }
 #elif defined(GGML_USE_KOMPUTE)
     if (model->n_gpu_layers > 0) {