fix support for multiple GPUs, fix single-device selection, unify id/device_id/device_index naming

Jianyu Zhang 2024-06-19 22:54:15 +08:00 committed by Neo Zhang
parent a9f3b10215
commit de2763118f
4 changed files with 453 additions and 221 deletions


@@ -34,6 +34,12 @@ GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *des
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
 GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
+GGML_API GGML_CALL void ggml_sycl_set_single_device(int main_gpu_id);
+
+// GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
+// GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
+
 // SYCL doesn't support registering host memory, keep here for reference
 // GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
 // GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
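For orientation, a minimal caller sketch against the declarations above (not part of the diff; the device id 0 is an assumption for illustration). It mirrors the call order the llama.cpp hunk at the end of this commit uses: select the single device first, then create the backend:

    // Hypothetical caller, based only on declarations visible in this diff.
    #include "ggml-sycl.h"
    #include <cstdio>

    int main() {
        if (ggml_backend_sycl_get_device_count() <= 0) {
            fprintf(stderr, "no SYCL device found\n");
            return 1;
        }
        const int main_gpu_id = 0; // assumption: device 0 is the wanted GPU
        // switch the backend to single-GPU mode before creating the backend
        ggml_sycl_set_single_device(main_gpu_id);
        ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_id);
        if (backend == nullptr) {
            fprintf(stderr, "failed to initialize SYCL backend\n");
            return 1;
        }
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_sycl_get_device_memory(main_gpu_id, &free_mem, &total_mem);
        printf("device %d: %zu of %zu bytes free\n", main_gpu_id, free_mem, total_mem);
        return 0;
    }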


@@ -51,6 +51,90 @@ static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer);
 static inline int get_sycl_env(const char *env_name, int default_val);
 static inline int get_work_group_size(const sycl::device& device);

+static bool g_sycl_loaded = false;
+
+bool ggml_sycl_loaded(void) {
+    return g_sycl_loaded;
+}
+
+int get_sycl_env(const char *env_name, int default_val) {
+    char *user_device_string = getenv(env_name);
+    int user_number = default_val;
+
+    unsigned n;
+    if (user_device_string != NULL &&
+        sscanf(user_device_string, " %u", &n) == 1) {
+        user_number = (int)n;
+    } else {
+        user_number = default_val;
+    }
+    return user_number;
+}
+
+static ggml_sycl_device_info ggml_sycl_init() try {
+    static ggml_sycl_device_info info = {};
+    static bool initialized = false;
+
+    if (!initialized) {
+        fprintf(stderr, "[SYCL] call ggml_init_sycl\n");
+        g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
+        fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__,
+                g_ggml_sycl_debug);
+
+#if defined(GGML_SYCL_F16)
+        fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__);
+#else
+        fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
+#endif
+
+#if defined(GGML_SYCL_FORCE_MMQ)
+        fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__);
+#else
+        fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__);
+#endif
+
+#if defined(SYCL_USE_XMX)
+        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
+#else
+        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
+#endif
+
+        if (CHECK_TRY_ERROR(g_all_sycl_device_count =
+                                dpct::dev_mgr::instance().device_count()) !=
+            0) {
+            initialized = true;
+            g_sycl_loaded = false;
+            return info; // empty info: device enumeration failed
+        }
+        GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
+
+        ggml_backend_sycl_print_sycl_devices();
+        initialized = true;
+        g_sycl_loaded = true;
+    }
+
+    info.refresh_device(SYCL_MUL_GPU_MODE);
+
+    if (info.device_count == 0) {
+        fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME "\n",
+                __func__);
+        return info;
+    }
+    GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES);
+    return info;
+} catch (sycl::exception const &exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+ggml_sycl_device_info &ggml_sycl_info() {
+    static ggml_sycl_device_info info = ggml_sycl_init();
+    return info;
+}
+
 void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
                     const void *ptr_src, size_t size) {
     char *host_buf = (char *)malloc(size);
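To make the env-parsing fallback above concrete, a self-contained sketch of the same pattern (illustrative names, not part of the commit): the " %u" format skips leading whitespace, so GGML_SYCL_DEBUG=1 enables the debug prints, while an unset or non-numeric value falls back to the default:

    // Illustrative re-implementation of the env lookup used above.
    #include <cstdio>
    #include <cstdlib>

    static int env_int(const char *name, int default_val) {
        const char *s = getenv(name);
        unsigned n;
        // " %u" skips leading whitespace; parse failure keeps the default
        if (s != NULL && sscanf(s, " %u", &n) == 1) {
            return (int)n;
        }
        return default_val;
    }

    int main() {
        printf("GGML_SYCL_DEBUG=%d\n", env_int("GGML_SYCL_DEBUG", 0));
        return 0;
    }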
@@ -2016,12 +2100,6 @@ static void im2col_sycl(const float *x, T *dst, int IW, int IH,
 }

-static bool g_sycl_loaded = false;
-
-bool ggml_sycl_loaded(void) {
-    return g_sycl_loaded;
-}
-
 void print_device_detail(int id, sycl::device &device, std::string device_type) {

     dpct::device_info prop;
@@ -2121,67 +2199,20 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }

-static ggml_sycl_device_info ggml_sycl_init() {
-    ggml_sycl_device_info info = {};
-
-    info.device_count = dpct::dev_mgr::instance().device_count();
-    if (info.device_count == 0) {
-        fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__);
-        return info;
-    }
-
-    GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES);
-
-    int64_t total_vram = 0;
-
-#if defined(GGML_SYCL_FORCE_MMQ)
-    fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__);
-#else
-    fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__);
-#endif
-#if defined(SYCL_USE_XMX)
-    fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
-#else
-    fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
-#endif
-    fprintf(stderr, "%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count);
-
-    for (int i = 0; i < info.device_count; ++i) {
-        info.devices[i].vmm = 0;
-        dpct::device_info prop;
-        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-            prop, dpct::dev_mgr::instance().get_device(i))));
-
-        info.default_tensor_split[i] = total_vram;
-        total_vram += prop.get_global_mem_size();
-
-        info.devices[i].cc =
-            100 * prop.get_major_version() + 10 * prop.get_minor_version();
-    }
-
-    for (int id = 0; id < info.device_count; ++id) {
-        info.default_tensor_split[id] /= total_vram;
-    }
-    return info;
-}
-
-const ggml_sycl_device_info & ggml_sycl_info() {
-    static ggml_sycl_device_info info = ggml_sycl_init();
-    return info;
-}
-
-/*
-device_index: device index from 0 to n (continue numbers).
-It is used for device select/set in SYCL backend internal data structure.
-*/
-inline void check_allow_gpu_index(const int device_index) {
-    if (device_index >= ggml_sycl_info().device_count) {
+GGML_API GGML_CALL void ggml_sycl_set_single_device(int device_id) {
+    ggml_sycl_info().refresh_device(SYCL_SINGLE_GPU_MODE, device_id);
+    ggml_sycl_set_main_device(device_id);
+}
+
+inline void check_allow_device_id(const int device_id) {
+    if (device_id >= ggml_sycl_info().device_count) {
         char error_buf[256];
         snprintf(
             error_buf,
             sizeof(error_buf),
-            "%s error: device_index:%d is out of range: [0-%d]",
+            "%s error: device_id:%d is out of range: [0-%d]",
             __func__,
-            device_index,
+            device_id,
             ggml_sycl_info().device_count - 1);
         fprintf(stderr, "%s\n", error_buf);
         assert(false);
@@ -2192,7 +2223,7 @@ inline void check_allow_gpu_index(const int device_index) {

 struct ggml_sycl_pool_leg : public ggml_sycl_pool {
     static const int MAX_SYCL_BUFFERS = 256;

-    int device;
+    int device_id;
     queue_ptr qptr;
     struct ggml_sycl_buffer {
         void * ptr = nullptr;
@@ -2204,7 +2235,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {

     explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) :
         qptr(qptr_),
-        device(device_) {
+        device_id(device_) {
     }

     ~ggml_sycl_pool_leg() {
@@ -2288,12 +2319,12 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
     }
 };

-std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
+std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device_id) {
     // TBD: NO VMM support
-    // if (ggml_sycl_info().devices[device].vmm) {
-    //     return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(device));
+    // if (ggml_sycl_info().devices[device_id].vmm) {
+    //     return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(device_id));
     // }
-    return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
+    return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device_id));
 }

 // TBD pool with virtual memory management
@@ -2704,13 +2735,13 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor
 static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split) {
     int64_t min_compute_capability = INT_MAX;
     int64_t max_compute_capability = INT_MIN;
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) {
-            if (min_compute_capability > ggml_sycl_info().devices[i].cc) {
-                min_compute_capability = ggml_sycl_info().devices[i].cc;
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
+        if (tensor_split[id] < (id + 1 < ggml_sycl_info().device_count ? tensor_split[id + 1] : 1.0f)) {
+            if (min_compute_capability > ggml_sycl_info().devices[id].cc) {
+                min_compute_capability = ggml_sycl_info().devices[id].cc;
             }
-            if (max_compute_capability < ggml_sycl_info().devices[i].cc) {
-                max_compute_capability = ggml_sycl_info().devices[i].cc;
+            if (max_compute_capability < ggml_sycl_info().devices[id].cc) {
+                max_compute_capability = ggml_sycl_info().devices[id].cc;
             }
         }
     }
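The loop above only considers devices that actually receive rows under the current tensor split. A standalone sketch of that selection rule (simplified types and illustrative names, not part of the commit):

    // Illustrative: find min/max compute capability among devices whose
    // tensor_split slice is non-empty, mirroring the loop above.
    #include <algorithm>
    #include <climits>
    #include <cstdint>

    struct dev_cc { int cc; };

    static void minmax_cc(const dev_cc *devs, const float *tensor_split,
                          int device_count, int64_t &min_cc, int64_t &max_cc) {
        min_cc = INT_MAX;
        max_cc = INT_MIN;
        for (int id = 0; id < device_count; ++id) {
            // a device participates if its split boundary lies below the next one
            const float next = (id + 1 < device_count) ? tensor_split[id + 1] : 1.0f;
            if (tensor_split[id] < next) {
                min_cc = std::min<int64_t>(min_cc, devs[id].cc);
                max_cc = std::max<int64_t>(max_cc, devs[id].cc);
            }
        }
    }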
@@ -3132,18 +3163,18 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
     }

 #ifdef NDEBUG
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        SYCL_CHECK(ggml_sycl_set_device(i));
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
+        SYCL_CHECK(ggml_sycl_set_device(id));
     }

-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        SYCL_CHECK(ggml_sycl_set_device(i));
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
+        SYCL_CHECK(ggml_sycl_set_device(id));

         for (int id_other = 0; id_other < ggml_sycl_info().device_count; ++id_other) {
-            if (i == id_other) {
+            if (id == id_other) {
                 continue;
             }
-            if (i != main_device && id_other != main_device) {
+            if (id != main_device && id_other != main_device) {
                 continue;
             }
@@ -3240,62 +3271,62 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
     int used_devices = 0;
     queue_ptr main_stream = ctx.stream();

-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
         // by default, use all rows
-        dev[i].row_low  = 0;
-        dev[i].row_high = ne01;
+        dev[id].row_low  = 0;
+        dev[id].row_high = ne01;

         // for multi GPU, get the row boundaries from tensor split
         // and round to mul_mat_q tile sizes
         if (split) {
             const int64_t rounding = get_row_rounding(src0->type, tensor_split);

-            if (i != 0) {
-                dev[i].row_low = ne01*tensor_split[i];
-                if (dev[i].row_low < ne01) {
-                    dev[i].row_low -= dev[i].row_low % rounding;
+            if (id != 0) {
+                dev[id].row_low = ne01*tensor_split[id];
+                if (dev[id].row_low < ne01) {
+                    dev[id].row_low -= dev[id].row_low % rounding;
                 }
             }

-            if (i != ggml_sycl_info().device_count - 1) {
-                dev[i].row_high = ne01*tensor_split[i + 1];
-                if (dev[i].row_high < ne01) {
-                    dev[i].row_high -= dev[i].row_high % rounding;
+            if (id != ggml_sycl_info().device_count - 1) {
+                dev[id].row_high = ne01*tensor_split[id + 1];
+                if (dev[id].row_high < ne01) {
+                    dev[id].row_high -= dev[id].row_high % rounding;
                 }
             }
         }
     }

-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) {
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
+        if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
             continue;
         }

         used_devices++;

-        const bool src1_on_device = i == ctx.device;
-        const bool  dst_on_device = i == ctx.device;
+        const bool src1_on_device = id == ctx.device;
+        const bool  dst_on_device = id == ctx.device;

-        ggml_sycl_set_device(i);
-        queue_ptr stream = ctx.stream(i, 0);
+        ggml_sycl_set_device(id);
+        queue_ptr stream = ctx.stream(id, 0);

         if (src0_is_contiguous) {
-            dev[i].src0_dd = (char *) src0->data;
+            dev[id].src0_dd = (char *) src0->data;
         } else {
-            dev[i].src0_dd = dev[i].src0_dd_alloc.alloc(ctx.pool(i), ggml_nbytes(src0));
+            dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
         }

         if (src1_on_device && src1_is_contiguous) {
-            dev[i].src1_ddf = (float *) src1->data;
+            dev[id].src1_ddf = (float *) src1->data;
         } else {
-            dev[i].src1_ddf = dev[i].src1_ddf_alloc.alloc(ctx.pool(i), ggml_nelements(src1));
+            dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
         }

         if (convert_src1_to_q8_1) {
-            dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
+            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);

             if (src1_on_device && src1_is_contiguous) {
-                quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
+                quantize_row_q8_1_sycl(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
                 /*
                 DPCT1010:90: SYCL uses exceptions to report errors and does not
                 use the error codes. The call was replaced with 0. You need to
@@ -3306,10 +3337,10 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
         }

         if (dst_on_device) {
-            dev[i].dst_dd = (float *) dst->data;
+            dev[id].dst_dd = (float *) dst->data;
         } else {
-            const size_t size_dst_ddf = split ? (dev[i].row_high - dev[i].row_low)*ne1 : ggml_nelements(dst);
-            dev[i].dst_dd = dev[i].dst_dd_alloc.alloc(ctx.pool(i), size_dst_ddf);
+            const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
+            dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf);
         }
     }
@@ -3332,20 +3363,20 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
         const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_SYCL_MAX_STREAMS : 0;
         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;

-        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-            if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) {
+        for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
+            if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
                 continue;
             }

-            const bool src1_on_device = i == ctx.device;
-            const bool  dst_on_device = i == ctx.device;
-            const int64_t row_diff = dev[i].row_high - dev[i].row_low;
+            const bool src1_on_device = id == ctx.device;
+            const bool  dst_on_device = id == ctx.device;
+            const int64_t row_diff = dev[id].row_high - dev[id].row_low;

-            ggml_sycl_set_device(i);
-            queue_ptr stream = ctx.stream(i, is);
+            ggml_sycl_set_device(id);
+            queue_ptr stream = ctx.stream(id, is);

             // wait for main GPU data if necessary
-            if (split && (i != ctx.device || is != 0)) {
+            if (split && (id != ctx.device || is != 0)) {
                 /*
                 DPCT1009:163: SYCL uses exceptions to report errors and does not
                 use the error codes. The original code was commented out and a
@@ -3362,20 +3393,20 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
             const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;

             // for split tensors the data begins at i0 == i0_offset_low
-            char  *  src0_dd_i = dev[i].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
-            float * src1_ddf_i = dev[i].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
-            char  * src1_ddq_i = dev[i].src1_ddq + src1_ddq_i_offset;
-            float *   dst_dd_i = dev[i].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
+            char  *  src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
+            float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
+            char  * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset;
+            float *   dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);

             // the main device memory buffer can be on VRAM scratch, with space for all partial results
             // in that case an offset on dst_ddf_i is needed
-            if (i == ctx.device) {
-                dst_dd_i += dev[i].row_low; // offset is 0 if no tensor split
+            if (id == ctx.device) {
+                dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
             }

             // copy src0, src1 to device if necessary
             if (src1_is_contiguous) {
-                if (i != ctx.device) {
+                if (id != ctx.device) {
                     if (convert_src1_to_q8_1) {
                         char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
                         SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
@@ -3410,14 +3441,14 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
             }

             if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
-                SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[i].row_low, dev[i].row_high, stream));
+                SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
             }

             if (src1->type == GGML_TYPE_F16) {
                 src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10;
             }

             // do the computation
             SYCL_CHECK(CHECK_TRY_ERROR(op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
-                dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream)));
+                dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream)));
             /*
             DPCT1010:93: SYCL uses exceptions to report errors and does not
             use the error codes. The call was replaced with 0. You need to
@@ -3436,7 +3467,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
                     // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
                     float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                     GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
-                    dhf_dst_i += src1_col_0*ne0 + dev[i].row_low;
+                    dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;

                     SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
                         dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
@@ -3453,7 +3484,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
             }

             // add event for the main device to wait on until other device is done
-            if (split && (i != ctx.device || is != 0)) {
+            if (split && (id != ctx.device || is != 0)) {
                 /*
                 DPCT1024:94: The original code returned the error code that
                 was further consumed by the program logic. This original
@@ -3461,7 +3492,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
                 program logic consuming the error code.
                 */
                 SYCL_CHECK(CHECK_TRY_ERROR(
-                    *src0_extra->events[i][is] =
+                    *src0_extra->events[id][is] =
                         stream->ext_oneapi_submit_barrier()));
             }
         }
@@ -3474,14 +3505,14 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
         is_max = is_max <= GGML_SYCL_MAX_STREAMS ? is_max : GGML_SYCL_MAX_STREAMS;

         ggml_sycl_set_device(ctx.device);
-        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-            if (dev[i].row_low == dev[i].row_high) {
+        for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
+            if (dev[id].row_low == dev[id].row_high) {
                 continue;
             }
             for (int64_t is = 0; is < is_max; ++is) {
                 SYCL_CHECK(CHECK_TRY_ERROR(
                     ctx.stream()->ext_oneapi_submit_barrier(
-                        {*src0_extra->events[i][is]})));
+                        {*src0_extra->events[id][is]})));
             }
         }
     }
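For reference, the row-boundary computation the renamed loops perform: each device id receives rows [row_low, row_high) derived from the cumulative tensor_split fractions, with interior boundaries rounded down to a tile multiple. A standalone sketch under those assumptions (illustrative names, not part of the commit):

    // Illustrative: derive per-device row ranges from cumulative split
    // fractions, rounding interior boundaries down to `rounding` rows.
    #include <cstdint>

    struct row_range { int64_t row_low, row_high; };

    static void split_rows(row_range *dev, const float *tensor_split,
                           int device_count, int64_t ne01, int64_t rounding) {
        for (int id = 0; id < device_count; ++id) {
            dev[id].row_low  = 0;     // by default, use all rows
            dev[id].row_high = ne01;
            if (id != 0) {
                dev[id].row_low = (int64_t)(ne01 * tensor_split[id]);
                if (dev[id].row_low < ne01) {
                    dev[id].row_low -= dev[id].row_low % rounding;
                }
            }
            if (id != device_count - 1) {
                dev[id].row_high = (int64_t)(ne01 * tensor_split[id + 1]);
                if (dev[id].row_high < ne01) {
                    dev[id].row_high -= dev[id].row_high % rounding;
                }
            }
        }
    }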
@@ -4310,17 +4341,17 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
     return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
 }

-void ggml_sycl_set_main_device(const int main_device) try {
-    if (dpct::get_current_device_id() == main_device) return;
-    check_allow_gpu_index(main_device);
-    dpct::select_device(main_device);
+void ggml_sycl_set_main_device(const int main_device_id) try {
+    if (dpct::get_current_device_id() == main_device_id) return;
+    check_allow_device_id(main_device_id);
+    dpct::select_device(main_device_id);
     if (g_ggml_sycl_debug) {
         dpct::device_info prop;
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-            prop, dpct::dev_mgr::instance().get_device(main_device))));
+            prop, dpct::dev_mgr::instance().get_device(main_device_id))));
         fprintf(stderr, "Using device %d (%s) as main device\n",
-                main_device, prop.get_name());
+                main_device_id, prop.get_name());
     }
 }
 catch (sycl::exception const &exc) {
@@ -4475,9 +4506,9 @@ GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try
     GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
     for(int i=0;i<max_len;i++) id_list[i] = -1;

-    for (int i=0;i< ggml_sycl_info().device_count;i++){
-        if (i>=max_len) break;
-        id_list[i] = i;
+    for (int id=0;id< ggml_sycl_info().device_count;id++){
+        if (id>=max_len) break;
+        id_list[id] = id;
     }
     return;
 }
@@ -4501,12 +4532,12 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }

-GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
+GGML_API GGML_CALL void ggml_sycl_get_device_description(int device_id, char *description,
                                                          size_t description_size) try {
     GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
     dpct::device_info prop;
     SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-        prop, dpct::dev_mgr::instance().get_device(device))));
+        prop, dpct::dev_mgr::instance().get_device(device_id))));
     snprintf(description, description_size, "%s", prop.get_name());
 }
 catch (sycl::exception const &exc) {
@@ -4515,10 +4546,10 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }

-GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
+GGML_CALL void ggml_backend_sycl_get_device_memory(int device_id, size_t *free,
                                                    size_t *total) try {
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
-    ggml_sycl_set_device(device);
+    ggml_sycl_set_device(device_id);
     /*
     DPCT1009:218: SYCL uses exceptions to report errors and does not use the
@@ -4531,7 +4562,7 @@ GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
     You may need to adjust the code.
     */
     SYCL_CHECK(CHECK_TRY_ERROR(
-        dpct::dev_mgr::instance().get_device(device).get_memory_info(*free, *total)));
+        dpct::dev_mgr::instance().get_device(device_id).get_memory_info(*free, *total)));
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -4553,9 +4584,9 @@ struct ggml_backend_sycl_buffer_context {
     queue_ptr stream;
     std::string name;

-    ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
-        device(device), dev_ptr(dev_ptr), stream(stream) {
-        check_allow_gpu_index(device);
+    ggml_backend_sycl_buffer_context(int device_id, void * dev_ptr, queue_ptr stream) :
+        device(device_id), dev_ptr(dev_ptr), stream(stream) {
+        check_allow_device_id(device);
         name = (GGML_SYCL_NAME + std::to_string(device));
     }
@@ -4831,58 +4862,58 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
     /* .is_host          = */ nullptr,
 };

-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) {
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);

     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");

-    if (device>=ggml_sycl_info().device_count or device<0) {
-        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
-            device, ggml_sycl_info().device_count-1);
-        GGML_ASSERT(device<ggml_sycl_info().device_count);
+    if (device_id>=ggml_sycl_info().device_count or device_id<0) {
+        printf("ggml_backend_sycl_buffer_type error: device_id:%d is out of range [0, %d]; did you forget to call ggml_backend_sycl_set_single_device()?\n",
+            device_id, ggml_sycl_info().device_count-1);
+        GGML_ASSERT(device_id<ggml_sycl_info().device_count);
     }

     static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];

     static bool ggml_backend_sycl_buffer_type_initialized = false;

     if (!ggml_backend_sycl_buffer_type_initialized) {
-        for (int i = 0; i < ggml_sycl_info().device_count; i++) {
-            auto & device_i = dpct::dev_mgr::instance().get_device(i);
-            queue_ptr stream = &(device_i.default_queue());
-            ggml_backend_sycl_buffer_types[i] = {
+        for (int id = 0; id < ggml_sycl_info().device_count; id++) {
+            auto & device = dpct::dev_mgr::instance().get_device(id);
+            queue_ptr stream = &(device.default_queue());
+            ggml_backend_sycl_buffer_types[id] = {
                 /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
-                /* .context  = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), stream},
+                /* .context  = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), stream},
             };
         }
         ggml_backend_sycl_buffer_type_initialized = true;
     }
-    return &ggml_backend_sycl_buffer_types[device];
+    return &ggml_backend_sycl_buffer_types[device_id];
 }

 ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
-    int device = ctx->device;
-    if (device>=ggml_sycl_info().device_count or device<0) {
-        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
-            device, ggml_sycl_info().device_count-1);
-        GGML_ASSERT(device<ggml_sycl_info().device_count);
+    int device_id = ctx->device;
+    if (device_id>=ggml_sycl_info().device_count or device_id<0) {
+        printf("ggml_backend_sycl_buffer_type error: device_id:%d is out of range [0, %d]; did you forget to call ggml_backend_sycl_set_single_device()?\n",
+            device_id, ggml_sycl_info().device_count-1);
+        GGML_ASSERT(device_id<ggml_sycl_info().device_count);
     }

     static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];

     static bool ggml_backend_sycl_buffer_type_initialized = false;

     if (!ggml_backend_sycl_buffer_type_initialized) {
-        for (int i = 0; i < ggml_sycl_info().device_count; i++) {
-            ggml_backend_sycl_buffer_types[i] = {
+        for (int id = 0; id < ggml_sycl_info().device_count; id++) {
+            ggml_backend_sycl_buffer_types[id] = {
                 /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
-                /* .context  = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), ctx->stream(i, 0)},
+                /* .context  = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), ctx->stream(id, 0)},
             };
         }
         ggml_backend_sycl_buffer_type_initialized = true;
     }
-    return &ggml_backend_sycl_buffer_types[device];
+    return &ggml_backend_sycl_buffer_types[device_id];
 }

 // sycl split buffer type
@@ -4903,9 +4934,9 @@ static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tens
 struct ggml_backend_sycl_split_buffer_context {
     ~ggml_backend_sycl_split_buffer_context() try {
         for (ggml_tensor_extra_gpu * extra : tensor_extras) {
-            for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+            for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
                 for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
-                    if (extra->events[i][is] != nullptr) {
+                    if (extra->events[id][is] != nullptr) {
                         /*
                         DPCT1009:206: SYCL uses exceptions to report errors and
                         does not use the error codes. The original code was
@@ -4913,19 +4944,19 @@ struct ggml_backend_sycl_split_buffer_context {
                         need to rewrite this code.
                         */
                         SYCL_CHECK(CHECK_TRY_ERROR(
-                            dpct::destroy_event(extra->events[i][is])));
+                            dpct::destroy_event(extra->events[id][is])));
                     }
                 }
-                if (extra->data_device[i] != nullptr) {
+                if (extra->data_device[id] != nullptr) {
                     /*
                     DPCT1009:207: SYCL uses exceptions to report errors and does
                     not use the error codes. The original code was commented out
                     and a warning string was inserted. You need to rewrite this
                     code.
                     */
-                    ggml_sycl_set_device(i);
+                    ggml_sycl_set_device(id);
                     SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(
-                        extra->data_device[i], *(streams[i]))));
+                        extra->data_device[id], *(streams[id]))));
                 }
             }
             delete extra;
@@ -4978,9 +5009,9 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
     ctx->tensor_extras.push_back(extra);
     ctx->streams.push_back(&(dpct::get_current_device().default_queue()));

-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
         int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);

         int64_t nrows_split = row_high - row_low;
         if (nrows_split == 0) {
@@ -4997,8 +5028,8 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
         // FIXME: do not crash if cudaMalloc fails
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
-        ggml_sycl_set_device(i);
-        const queue_ptr stream = ctx->streams[i];
+        ggml_sycl_set_device(id);
+        const queue_ptr stream = ctx->streams[id];
         char * buf;
         /*
         DPCT1009:208: SYCL uses exceptions to report errors and does not use the
@@ -5021,7 +5052,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
                 .wait()));
         }

-        extra->data_device[i] = buf;
+        extra->data_device[id] = buf;

         for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
             /*
@@ -5030,7 +5061,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
             string was inserted. You need to rewrite this code.
             */
             SYCL_CHECK(
-                CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event()));
+                CHECK_TRY_ERROR(extra->events[id][is] = new sycl::event()));
         }
     }
     tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
@@ -5057,9 +5088,9 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
     const size_t nb1 = tensor->nb[1];
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;

-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
         int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);

         int64_t nrows_split = row_high - row_low;
         if (nrows_split == 0) {
@@ -5081,11 +5112,11 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
         error codes. The original code was commented out and a warning string
         was inserted. You need to rewrite this code.
         */
-        ggml_sycl_set_device(i);
-        const queue_ptr stream = ctx->streams[i];
+        ggml_sycl_set_device(id);
+        const queue_ptr stream = ctx->streams[id];
         SYCL_CHECK(CHECK_TRY_ERROR(
             (*stream)
-                .memcpy(extra->data_device[i], buf_host, original_size)
+                .memcpy(extra->data_device[id], buf_host, original_size)
                 .wait()));
     }
 }
@@ -5110,9 +5141,9 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
     const size_t nb1 = tensor->nb[1];
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;

-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
         int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);

         int64_t nrows_split = row_high - row_low;
         if (nrows_split == 0) {
@@ -5134,11 +5165,11 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
         error codes. The original code was commented out and a warning string
         was inserted. You need to rewrite this code.
         */
-        ggml_sycl_set_device(i);
-        const queue_ptr stream = ctx->streams[i];
+        ggml_sycl_set_device(id);
+        const queue_ptr stream = ctx->streams[id];
         SYCL_CHECK(CHECK_TRY_ERROR(
             (*stream)
-                .memcpy(buf_host, extra->data_device[i], original_size)
+                .memcpy(buf_host, extra->data_device[id], original_size)
                 .wait()));
     }
 }
@@ -5193,9 +5224,9 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_
     const int64_t ne0 = tensor->ne[0];

-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
         int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, i);
+        get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);

         int64_t nrows_split = row_high - row_low;
         if (nrows_split == 0) {
@@ -5244,12 +5275,12 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const f
         tensor_split_arr = ggml_sycl_info().default_tensor_split;
     } else {
         float split_sum = 0.0f;
-        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-            tensor_split_arr[i] = split_sum;
-            split_sum += tensor_split[i];
+        for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
+            tensor_split_arr[id] = split_sum;
+            split_sum += tensor_split[id];
         }
-        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-            tensor_split_arr[i] /= split_sum;
+        for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
+            tensor_split_arr[id] /= split_sum;
         }
     }
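The loops above turn per-device proportions into normalized cumulative boundaries: for example, a user split of {1, 1, 2} yields prefix sums {0, 1, 2} with split_sum = 4, normalizing to boundaries {0, 0.25, 0.5} (1.0 is implied at the end), so device 0 covers [0, 0.25), device 1 [0.25, 0.5), and device 2 [0.5, 1.0). A minimal sketch of the same conversion (illustrative, not part of the commit):

    // Illustrative: per-device proportions -> normalized cumulative boundaries.
    static void normalize_split(const float *tensor_split, float *out, int device_count) {
        float split_sum = 0.0f;
        for (int id = 0; id < device_count; ++id) {
            out[id] = split_sum;            // boundary before adding this device
            split_sum += tensor_split[id];
        }
        for (int id = 0; id < device_count; ++id) {
            out[id] /= split_sum;
        }
    }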
@@ -5613,13 +5644,10 @@ static ggml_guid_t ggml_backend_sycl_guid() {
     return &guid;
 }

-GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
-    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
-    ggml_check_sycl();
-
-    check_allow_gpu_index(device);
-
-    ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(device);
+GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device_id) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init, device_id=%d\n", device_id);
+    check_allow_device_id(device_id);
+    ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(ggml_sycl_info(), device_id);
     if (ctx == nullptr) {
         fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
         return nullptr;
@@ -5654,10 +5682,10 @@ extern "C" int ggml_backend_sycl_reg_devices();

 int ggml_backend_sycl_reg_devices() {
     assert(ggml_sycl_info().device_count>0);
-    for (int i = 0; i < ggml_sycl_info().device_count; i++) {
+    for (int id = 0; id < ggml_sycl_info().device_count; id++) {
         char name[128];
-        snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, i);
-        ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(i), (void *) (intptr_t) i);
+        snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, id);
+        ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(id), (void *) (intptr_t) id);
     }
     return ggml_sycl_info().device_count;
 }


@@ -49,7 +49,7 @@ static int g_ggml_sycl_debug = 0;
 // #define DEBUG_SYCL_MALLOC

-static int g_work_group_size = 0;
+static int g_work_group_size = -1; // set by ggml_sycl_device_info::refresh_device()
 // typedef sycl::half ggml_fp16_t;

 #define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
@@ -89,6 +89,11 @@ enum ggml_sycl_backend_gpu_mode {
     SYCL_MUL_GPU_MODE
 };

+enum ggml_sycl_backend_device_filter {
+    SYCL_DEVICE_FILTER_ALL = 0,       // expose every device dpct reports
+    SYCL_DEVICE_FILTER_TOP_LEVEL_ZERO // only GPUs sharing the top max compute units
+};
+
 static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");

 static void crash() {
@@ -158,18 +163,18 @@ static size_t g_scratch_offset = 0;

 int get_current_device_id();

-inline dpct::err0 ggml_sycl_set_device(const int device) try {
+inline dpct::err0 ggml_sycl_set_device(const int device_id) try {
     int current_device_id;
     SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));

-    // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d,
-    // current_device_id=%d\n", device, current_device);
-    if (device == current_device_id) {
+    GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, current_device_id=%d\n", device_id, current_device_id);
+    if (device_id == current_device_id) {
         return 0;
     }

-    return CHECK_TRY_ERROR(dpct::select_device(device));
+    return CHECK_TRY_ERROR(dpct::select_device(device_id));
 } catch (sycl::exception const& exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
               << ", line:" << __LINE__ << std::endl;
@@ -177,11 +182,142 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
     std::exit(1);
 }

-//////////////////////
+class sycl_device_mgr {
+  public:
+    std::vector<int> device_ids;
+    std::vector<sycl::device> devices;
+    std::vector<int> max_compute_units;
+    std::vector<int> work_group_sizes;
+    sycl::queue *first_queue;
+    std::vector<sycl::queue *> queues;
+    std::vector<sycl::context> ctxs;
+    std::string device_list = "";
+
+    sycl_device_mgr(ggml_sycl_backend_device_filter device_filter) {
+        if (device_filter == SYCL_DEVICE_FILTER_TOP_LEVEL_ZERO) {
+            detect_sycl_gpu_list_with_max_cu();
+            create_context_for_group_gpus();
+        } else {
+            detect_all_sycl_device_list();
+            create_context_queue_for_devices();
+        }
+        get_allow_devices();
+    }
+
+    /*
+    Bind all GPUs in the same host to one shared context, for better
+    performance of device-to-device copies in the future.
+    */
+    void create_context_for_group_gpus() {
+        sycl::context ctx = sycl::context(devices);
+        assert(device_ids.size() > 0);
+        first_queue = dpct::get_current_device().create_queue(ctx, devices[0]);
+        sycl::context ctx0 = first_queue->get_context();
+        for (size_t i = 0; i < device_ids.size(); i++) {
+            ctxs.push_back(ctx0);
+            dpct::select_device(device_ids[i]);
+            queues.push_back(
+                dpct::get_current_device().create_queue(ctx0, devices[i]));
+        }
+    }
+
+    void create_context_queue_for_devices() {
+        for (size_t i = 0; i < device_ids.size(); i++) {
+            sycl::context ctx = sycl::context(devices[i]);
+            ctxs.push_back(ctx);
+            dpct::select_device(device_ids[i]);
+            queues.push_back(
+                dpct::get_current_device().create_queue(ctx, devices[i]));
+        }
+    }
+
+    void get_allow_devices() {
+        device_list = "";
+        for (size_t i = 0; i < device_ids.size(); ++i) {
+            device_list += std::to_string(device_ids[i]);
+            device_list += ",";
+        }
+        if (device_list.length() > 1) {
+            device_list.pop_back(); // drop trailing comma
+        }
+    }
+
+    bool is_allowed_device(int device_id) {
+        return std::find(device_ids.begin(), device_ids.end(), device_id) !=
+               device_ids.end();
+    }
+
+    void detect_all_sycl_device_list() try {
+        int device_count = dpct::dev_mgr::instance().device_count();
+
+        for (int id = 0; id < device_count; id++) {
+            sycl::device device = dpct::dev_mgr::instance().get_device(id);
+            device_ids.push_back(id);
+            devices.push_back(device);
+            dpct::device_info prop;
+            dpct::get_device_info(prop, device);
+            work_group_sizes.push_back(prop.get_max_work_group_size());
+            max_compute_units.push_back(prop.get_max_compute_units());
+        }
+        return;
+    } catch (sycl::exception const &exc) {
+        std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+                  << ", line:" << __LINE__ << std::endl;
+        std::exit(1);
+    }
+
+    /*
+    Use all GPUs that share the same top max compute units.
+    */
+    void detect_sycl_gpu_list_with_max_cu() try {
+        int device_count = dpct::dev_mgr::instance().device_count();
+        int local_max_compute_units = 0;
+        for (int id = 0; id < device_count; id++) {
+            sycl::device device = dpct::dev_mgr::instance().get_device(id);
+            if (!device.is_gpu())
+                continue;
+            dpct::device_info prop;
+            dpct::get_device_info(prop, device);
+            if (local_max_compute_units < prop.get_max_compute_units())
+                local_max_compute_units = prop.get_max_compute_units();
+        }
+
+        for (int id = 0; id < device_count; id++) {
+            sycl::device device = dpct::dev_mgr::instance().get_device(id);
+            if (!device.is_gpu())
+                continue;
+            dpct::device_info prop;
+            dpct::get_device_info(prop, device);
+            if (local_max_compute_units == prop.get_max_compute_units() &&
+                is_ext_oneapi_device(device)) {
+                device_ids.push_back(id);
+                devices.push_back(device);
+                work_group_sizes.push_back(prop.get_max_work_group_size());
+                max_compute_units.push_back(prop.get_max_compute_units());
+            }
+        }
+        return;
+    } catch (sycl::exception const &exc) {
+        std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+                  << ", line:" << __LINE__ << std::endl;
+        std::exit(1);
+    }
+
+    int get_device_count() { return (int)device_ids.size(); }
+
+    bool is_ext_oneapi_device(const sycl::device &dev) {
+        sycl::backend dev_backend = dev.get_backend();
+        if (dev_backend == sycl::backend::ext_oneapi_level_zero ||
+            dev_backend == sycl::backend::ext_oneapi_cuda ||
+            dev_backend == sycl::backend::ext_oneapi_hip)
+            return true;
+        return false;
+    }
+};

 struct ggml_sycl_device_info {
     int device_count;
+    int main_gpu_id = -1;
+    ggml_sycl_backend_gpu_mode use_gpu_mode = SYCL_MUL_GPU_MODE;

     struct sycl_device_info {
         int cc; // compute capability
         // int nsm; // number of streaming multiprocessors
@@ -193,9 +329,71 @@ struct ggml_sycl_device_info {

     sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};

     std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
-};

-const ggml_sycl_device_info & ggml_sycl_info();
+    sycl_device_mgr *local_sycl_device_mgr = NULL;
+
+    void print_gpu_device_list() {
+        GGML_ASSERT(local_sycl_device_mgr);
+        const char *hint = NULL;
+        if (use_gpu_mode == SYCL_MUL_GPU_MODE) {
+            hint = "detected %d SYCL GPUs: [%s] with top max compute units: %d\n";
+            fprintf(stderr, hint, local_sycl_device_mgr->get_device_count(),
+                    local_sycl_device_mgr->device_list.c_str(),
+                    local_sycl_device_mgr->max_compute_units[main_gpu_id]);
+        } else {
+            hint = "using main device [%d] with max compute units: %d\n";
+            fprintf(stderr, hint, main_gpu_id,
+                    local_sycl_device_mgr->max_compute_units[main_gpu_id]);
+        }
+    }
+
+    int work_group_size(int device_id) {
+        GGML_ASSERT(local_sycl_device_mgr);
+        return local_sycl_device_mgr->work_group_sizes[device_id];
+    }
+
+    void refresh_device(ggml_sycl_backend_gpu_mode gpu_mode,
+                        int p_main_gpu_id = 0) {
+        main_gpu_id = p_main_gpu_id;
+        use_gpu_mode = gpu_mode;
+        if (local_sycl_device_mgr) // drop the previous manager before rebuilding
+            delete local_sycl_device_mgr;
+
+        if (use_gpu_mode == SYCL_MUL_GPU_MODE) {
+            local_sycl_device_mgr =
+                new sycl_device_mgr(SYCL_DEVICE_FILTER_TOP_LEVEL_ZERO);
+        } else {
+            GGML_ASSERT(main_gpu_id >= 0);
+            local_sycl_device_mgr = new sycl_device_mgr(SYCL_DEVICE_FILTER_ALL);
+        }
+
+        device_count = local_sycl_device_mgr->get_device_count();
+
+        int64_t total_vram = 0;
+        for (int i = 0; i < device_count; ++i) {
+            devices[i].vmm = 0;
+            dpct::device_info prop;
+            // query the i-th allowed device; dpct ids may be non-contiguous
+            // after filtering, so index the manager's own device list
+            SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+                prop, local_sycl_device_mgr->devices[i])));
+
+            default_tensor_split[i] = total_vram;
+            total_vram += prop.get_global_mem_size();
+
+            devices[i].cc =
+                100 * prop.get_major_version() + 10 * prop.get_minor_version();
+        }
+
+        for (int id = 0; id < device_count; ++id) {
+            default_tensor_split[id] /= total_vram;
+        }
+
+        g_work_group_size = work_group_size(main_gpu_id);
+        print_gpu_device_list();
+    }
+};
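A hedged usage sketch of the manager above (illustrative fragment, not part of the commit; assumes the dpct runtime is initialized). On a machine with one discrete GPU and one iGPU with fewer compute units, the top-max-CU filter keeps only the discrete card, while SYCL_DEVICE_FILTER_ALL keeps both:

    // Illustrative fragment: inspect the filtered device set.
    sycl_device_mgr mgr(SYCL_DEVICE_FILTER_TOP_LEVEL_ZERO);
    fprintf(stderr, "allowed device ids: [%s], count=%d\n",
            mgr.device_list.c_str(), mgr.get_device_count());
    if (!mgr.is_allowed_device(1)) {
        // device 1 was filtered out, e.g. an iGPU with fewer compute units
    }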

 struct ggml_sycl_pool {
     virtual ~ggml_sycl_pool() = default;

@@ -262,16 +460,15 @@ struct ggml_backend_sycl_context {
     queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };

-    explicit ggml_backend_sycl_context(int device) :
+    explicit ggml_backend_sycl_context(struct ggml_sycl_device_info &sycl_device_info, int device) :
         device(device),
         name(GGML_SYCL_NAME + std::to_string(device)) {
+        qptrs[device][0] = sycl_device_info.local_sycl_device_mgr->queues[device];
     }

     queue_ptr stream(int device, int stream) {
-        if (qptrs[device][stream] == nullptr) {
-            qptrs[device][stream] = &(dpct::get_current_device().default_queue());
-        }
-        return qptrs[device][stream];
+        assert(qptrs[device][0] != nullptr);
+        return qptrs[device][0];
     }

     queue_ptr stream() {

@@ -17619,6 +17619,7 @@ struct llama_context * llama_new_context_with_model(
 #elif defined(GGML_USE_SYCL)
         // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            ggml_sycl_set_single_device(model->main_gpu);
             ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);