diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 81652047d..afb9a1f0e 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -48,6 +48,11 @@ static int g_ggml_sycl_debug = 0;
         }                                                                      \
     }()
 
+// #define DEBUG_SYCL_MALLOC
+
+static int g_work_group_size = -1;
+// typedef sycl::half ggml_fp16_t;
+
 #define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
 #define VER_4VEC 610 // todo for hardward optimize.
 #define VER_GEN9 700 // todo for hardward optimize.
@@ -163,6 +168,8 @@ int get_current_device_id();
     (void)bad_arch; // suppress unused function warning
 }
 
+int get_current_device_id();
+
 inline dpct::err0 ggml_sycl_set_device(const int device_id) try {
     int current_device_id;
 
@@ -217,11 +224,11 @@ struct ggml_sycl_device_info {
     bool sycl_visible_devices_existed = false;
 
     struct sycl_device_info {
-        int cc; // compute capability
+        int     cc;          // compute capability
         // int nsm; // number of streaming multiprocessors
         // size_t smpb; // max. shared memory per block
-        bool vmm; // virtual memory support
-        size_t total_vram;
+        bool    vmm;         // virtual memory support
+        size_t  total_vram;
     };
 
     sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
diff --git a/src/llama.cpp b/src/llama.cpp
index 1a7d24ccb..305dda36e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17620,6 +17620,7 @@ struct llama_context * llama_new_context_with_model(
 #elif defined(GGML_USE_SYCL)
         // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            ggml_sycl_set_single_device(model->main_gpu);
             ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
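
Note: the new `ggml_sycl_set_single_device()` call is only visible here at its use site; its definition is not part of these hunks. Below is a minimal sketch of what such a helper could look like, assuming it merely records the chosen device id in globals that the backend consults during initialization. The names `g_single_device_id` and `g_single_device_mode` are hypothetical and not taken from this patch.

```cpp
// Hypothetical sketch only -- the definition of ggml_sycl_set_single_device()
// is not shown in this patch. Assumed behavior: remember the main-GPU device
// id so that later SYCL device enumeration is restricted to that one device.
#include <cstdio>

static int  g_single_device_id   = -1;    // -1: no restriction, use all visible devices
static bool g_single_device_mode = false; // set once a single device is pinned

void ggml_sycl_set_single_device(int main_gpu_id) {
    // Pin the SYCL backend to one device; ggml_backend_sycl_init() would then
    // check g_single_device_mode before enumerating the full device list.
    g_single_device_id   = main_gpu_id;
    g_single_device_mode = true;
    fprintf(stderr, "ggml_sycl: single device mode, using device %d\n", main_gpu_id);
}
```

With something along these lines in place, the `llama.cpp` hunk above invokes the setter immediately before `ggml_backend_sycl_init(model->main_gpu)`, so only the main GPU is set up when split mode is `LLAMA_SPLIT_MODE_NONE` or `LLAMA_SPLIT_MODE_ROW`.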