fix to support multiple GPUs, fix set single device, unify id/device_id/device_index
parent a9f3b10215
commit de2763118f
4 changed files with 453 additions and 221 deletions
@@ -34,6 +34,12 @@ GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *des
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
 GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
 
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
+GGML_API GGML_CALL void ggml_sycl_set_single_device(int main_gpu_id);
+
+// GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
+// GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
+
 // SYCL doesn't support registering host memory, keep here for reference
 // GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
 // GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
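Together with the pre-existing query calls, these declarations are all a front end needs to pick a GPU. A caller-side enumeration sketch against this header (the helper name and loop bounds are illustrative, not part of the commit):

    #include <cstdio>
    #include "ggml-sycl.h"

    // List each visible device with its free/total memory.
    static void list_devices(void) {
        int n = ggml_backend_sycl_get_device_count();
        for (int id = 0; id < n; ++id) {
            size_t free_mem = 0, total_mem = 0;
            ggml_backend_sycl_get_device_memory(id, &free_mem, &total_mem);
            printf("SYCL device %d: %zu of %zu bytes free\n", id, free_mem, total_mem);
        }
    }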
@@ -51,6 +51,90 @@ static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer);
 static inline int get_sycl_env(const char *env_name, int default_val);
 static inline int get_work_group_size(const sycl::device& device);
 
+static bool g_sycl_loaded = false;
+
+bool ggml_sycl_loaded(void) {
+    return g_sycl_loaded;
+}
+
+int get_sycl_env(const char *env_name, int default_val) {
+    char *user_device_string = getenv(env_name);
+    int user_number = default_val;
+
+    unsigned n;
+    if (user_device_string != NULL &&
+        sscanf(user_device_string, " %u", &n) == 1) {
+        user_number = (int)n;
+    } else {
+        user_number = default_val;
+    }
+    return user_number;
+}
+
+static ggml_sycl_device_info ggml_sycl_init() try {
+    static bool initialized = false;
+
+    if (!initialized) {
+        fprintf(stderr, "[SYCL] call ggml_init_sycl\n");
+
+        g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
+        fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__,
+                g_ggml_sycl_debug);
+
+#if defined(GGML_SYCL_F16)
+        fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__);
+#else
+        fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
+#endif
+
+#if defined(GGML_SYCL_FORCE_MMQ)
+        fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__);
+#else
+        fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__);
+#endif
+
+#if defined(SYCL_USE_XMX)
+        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
+#else
+        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
+#endif
+
+        if (CHECK_TRY_ERROR(g_all_sycl_device_count =
+                                dpct::dev_mgr::instance().device_count()) !=
+            0) {
+            initialized = true;
+            g_sycl_loaded = false;
+            return ggml_sycl_device_info();
+        }
+        GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
+        ggml_backend_sycl_print_sycl_devices();
+        initialized = true;
+        g_sycl_loaded = true;
+    }
+
+    static ggml_sycl_device_info info = {};
+    info.refresh_device(SYCL_MUL_GPU_MODE);
+
+    if (info.device_count == 0) {
+        fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME "\n",
+                __func__);
+        return info;
+    }
+    GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES);
+
+    return info;
+} catch (sycl::exception const &exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+ggml_sycl_device_info &ggml_sycl_info() {
+    static ggml_sycl_device_info info = ggml_sycl_init();
+    return info;
+}
+
 void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
                     const void *ptr_src, size_t size) {
     char *host_buf = (char *)malloc(size);
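get_sycl_env() accepts any unsigned integer that sscanf can read with " %u" and otherwise falls back to the default. A standalone sketch of the same parsing rule (helper name invented for illustration):

    #include <cstdio>
    #include <cstdlib>

    // Same rule as get_sycl_env() above: a parseable unsigned value wins,
    // an unset or non-numeric variable yields the default.
    static int env_int_or(const char *name, int def) {
        const char *s = getenv(name);
        unsigned n;
        return (s != NULL && sscanf(s, " %u", &n) == 1) ? (int)n : def;
    }

    int main() {
        // With `export GGML_SYCL_DEBUG=1` this prints 1, otherwise 0.
        printf("GGML_SYCL_DEBUG=%d\n", env_int_or("GGML_SYCL_DEBUG", 0));
        return 0;
    }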
@@ -2016,12 +2100,6 @@ static void im2col_sycl(const float *x, T *dst, int IW, int IH,
 }
 
-static bool g_sycl_loaded = false;
-
-bool ggml_sycl_loaded(void) {
-    return g_sycl_loaded;
-}
-
 void print_device_detail(int id, sycl::device &device, std::string device_type) {
 
     dpct::device_info prop;
@@ -2121,67 +2199,20 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }
 
-static ggml_sycl_device_info ggml_sycl_init() {
-    ggml_sycl_device_info info = {};
-
-    info.device_count = dpct::dev_mgr::instance().device_count();
-    if (info.device_count == 0) {
-        fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__);
-        return info;
-    }
-
-    GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES);
-
-    int64_t total_vram = 0;
-#if defined(GGML_SYCL_FORCE_MMQ)
-    fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__);
-#else
-    fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__);
-#endif
-#if defined(SYCL_USE_XMX)
-    fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
-#else
-    fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
-#endif
-    fprintf(stderr, "%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count);
-
-    for (int i = 0; i < info.device_count; ++i) {
-        info.devices[i].vmm = 0;
-        dpct::device_info prop;
-        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-            prop, dpct::dev_mgr::instance().get_device(i))));
-
-        info.default_tensor_split[i] = total_vram;
-        total_vram += prop.get_global_mem_size();
-
-        info.devices[i].cc =
-            100 * prop.get_major_version() + 10 * prop.get_minor_version();
-    }
-
-    for (int id = 0; id < info.device_count; ++id) {
-        info.default_tensor_split[id] /= total_vram;
-    }
-    return info;
-}
+GGML_API GGML_CALL void ggml_sycl_set_single_device(int device_id) {
+    ggml_sycl_info().refresh_device(SYCL_SINGLE_GPU_MODE, device_id);
+    ggml_sycl_set_main_device(device_id);
+}
 
-const ggml_sycl_device_info & ggml_sycl_info() {
-    static ggml_sycl_device_info info = ggml_sycl_init();
-    return info;
-}
-
 /*
 device_id: device index from 0 to n (continuous numbers).
 It is used for device select/set in SYCL backend internal data structure.
 */
-inline void check_allow_gpu_index(const int device_index) {
-    if (device_index >= ggml_sycl_info().device_count) {
+inline void check_allow_device_id(const int device_id) {
+    if (device_id >= ggml_sycl_info().device_count) {
         char error_buf[256];
         snprintf(
             error_buf,
             sizeof(error_buf),
-            "%s error: device_index:%d is out of range: [0-%d]",
+            "%s error: device_id:%d is out of range: [0-%d]",
             __func__,
-            device_index,
+            device_id,
             ggml_sycl_info().device_count - 1);
         fprintf(stderr, "%s\n", error_buf);
         assert(false);
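Note that the guard only checks the upper bound; negative ids are caught later at the buffer-type entry points. A behavior sketch, assuming two visible devices:

    // Assuming ggml_sycl_info().device_count == 2:
    //   check_allow_device_id(0);  // passes silently
    //   check_allow_device_id(1);  // passes silently
    //   check_allow_device_id(2);  // logs "... device_id:2 is out of range: [0-1]"
    //                              // and trips assert(false) in debug builds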
@@ -2192,7 +2223,7 @@ inline void check_allow_gpu_index(const int device_index) {
 struct ggml_sycl_pool_leg : public ggml_sycl_pool {
     static const int MAX_SYCL_BUFFERS = 256;
 
-    int device;
+    int device_id;
     queue_ptr qptr;
     struct ggml_sycl_buffer {
        void * ptr = nullptr;
@@ -2204,7 +2235,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
 
     explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) :
         qptr(qptr_),
-        device(device_) {
+        device_id(device_) {
     }
 
     ~ggml_sycl_pool_leg() {
@@ -2288,12 +2319,12 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
     }
 };
 
-std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
+std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device_id) {
     // TBD: NO VMM support
-    // if (ggml_sycl_info().devices[device].vmm) {
-    //     return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(device));
+    // if (ggml_sycl_info().devices[device_id].vmm) {
+    //     return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(device_id));
     // }
-    return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
+    return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device_id));
 }
 
 // TBD pool with virtual memory management
@@ -2704,13 +2735,13 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor
 static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split) {
     int64_t min_compute_capability = INT_MAX;
     int64_t max_compute_capability = INT_MIN;
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) {
-            if (min_compute_capability > ggml_sycl_info().devices[i].cc) {
-                min_compute_capability = ggml_sycl_info().devices[i].cc;
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
+        if (tensor_split[id] < (id + 1 < ggml_sycl_info().device_count ? tensor_split[id + 1] : 1.0f)) {
+            if (min_compute_capability > ggml_sycl_info().devices[id].cc) {
+                min_compute_capability = ggml_sycl_info().devices[id].cc;
             }
-            if (max_compute_capability < ggml_sycl_info().devices[i].cc) {
-                max_compute_capability = ggml_sycl_info().devices[i].cc;
+            if (max_compute_capability < ggml_sycl_info().devices[id].cc) {
+                max_compute_capability = ggml_sycl_info().devices[id].cc;
             }
         }
     }
@@ -3132,18 +3163,18 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
     }
 
 #ifdef NDEBUG
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        SYCL_CHECK(ggml_sycl_set_device(i));
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
+        SYCL_CHECK(ggml_sycl_set_device(id));
     }
 
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        SYCL_CHECK(ggml_sycl_set_device(i));
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
+        SYCL_CHECK(ggml_sycl_set_device(id));
 
         for (int id_other = 0; id_other < ggml_sycl_info().device_count; ++id_other) {
-            if (i == id_other) {
+            if (id == id_other) {
                 continue;
             }
-            if (i != main_device && id_other != main_device) {
+            if (id != main_device && id_other != main_device) {
                 continue;
             }
 
@@ -3240,62 +3271,62 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
     int used_devices = 0;
     queue_ptr main_stream = ctx.stream();
 
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
         // by default, use all rows
-        dev[i].row_low = 0;
-        dev[i].row_high = ne01;
+        dev[id].row_low = 0;
+        dev[id].row_high = ne01;
 
         // for multi GPU, get the row boundaries from tensor split
         // and round to mul_mat_q tile sizes
         if (split) {
             const int64_t rounding = get_row_rounding(src0->type, tensor_split);
 
-            if (i != 0) {
-                dev[i].row_low = ne01*tensor_split[i];
-                if (dev[i].row_low < ne01) {
-                    dev[i].row_low -= dev[i].row_low % rounding;
+            if (id != 0) {
+                dev[id].row_low = ne01*tensor_split[id];
+                if (dev[id].row_low < ne01) {
+                    dev[id].row_low -= dev[id].row_low % rounding;
                 }
             }
 
-            if (i != ggml_sycl_info().device_count - 1) {
-                dev[i].row_high = ne01*tensor_split[i + 1];
-                if (dev[i].row_high < ne01) {
-                    dev[i].row_high -= dev[i].row_high % rounding;
+            if (id != ggml_sycl_info().device_count - 1) {
+                dev[id].row_high = ne01*tensor_split[id + 1];
+                if (dev[id].row_high < ne01) {
+                    dev[id].row_high -= dev[id].row_high % rounding;
                 }
             }
         }
     }
 
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) {
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
+        if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
             continue;
         }
 
         used_devices++;
 
-        const bool src1_on_device = i == ctx.device;
-        const bool dst_on_device = i == ctx.device;
+        const bool src1_on_device = id == ctx.device;
+        const bool dst_on_device = id == ctx.device;
 
-        ggml_sycl_set_device(i);
-        queue_ptr stream = ctx.stream(i, 0);
+        ggml_sycl_set_device(id);
+        queue_ptr stream = ctx.stream(id, 0);
 
         if (src0_is_contiguous) {
-            dev[i].src0_dd = (char *) src0->data;
+            dev[id].src0_dd = (char *) src0->data;
         } else {
-            dev[i].src0_dd = dev[i].src0_dd_alloc.alloc(ctx.pool(i), ggml_nbytes(src0));
+            dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
         }
 
         if (src1_on_device && src1_is_contiguous) {
-            dev[i].src1_ddf = (float *) src1->data;
+            dev[id].src1_ddf = (float *) src1->data;
         } else {
-            dev[i].src1_ddf = dev[i].src1_ddf_alloc.alloc(ctx.pool(i), ggml_nelements(src1));
+            dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
         }
 
         if (convert_src1_to_q8_1) {
-            dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
+            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
 
             if (src1_on_device && src1_is_contiguous) {
-                quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
+                quantize_row_q8_1_sycl(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
                 /*
                 DPCT1010:90: SYCL uses exceptions to report errors and does not
                 use the error codes. The call was replaced with 0. You need to
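For the split path, a quick worked re-run of the boundary math above (illustration only), assuming ne01 = 100 rows, two devices, tensor_split = {0.0f, 0.4f} and rounding = 32:

    #include <cstdint>
    #include <cstdio>

    // Standalone sketch of the row-boundary computation above.
    int main() {
        const int64_t ne01 = 100, rounding = 32;
        const float tensor_split[2] = {0.0f, 0.4f};
        int64_t low[2], high[2];
        for (int id = 0; id < 2; ++id) {
            low[id]  = id == 0 ? 0 : (int64_t)(ne01*tensor_split[id]);
            high[id] = id == 1 ? ne01 : (int64_t)(ne01*tensor_split[id + 1]);
            if (id != 0 && low[id]  < ne01) low[id]  -= low[id]  % rounding;
            if (id != 1 && high[id] < ne01) high[id] -= high[id] % rounding;
            printf("dev[%d]: rows [%ld, %ld)\n", id, (long)low[id], (long)high[id]);
        }
        // prints: dev[0]: rows [0, 32) and dev[1]: rows [32, 100)
        return 0;
    }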
@@ -3306,10 +3337,10 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
             }
 
             if (dst_on_device) {
-                dev[i].dst_dd = (float *) dst->data;
+                dev[id].dst_dd = (float *) dst->data;
             } else {
-                const size_t size_dst_ddf = split ? (dev[i].row_high - dev[i].row_low)*ne1 : ggml_nelements(dst);
-                dev[i].dst_dd = dev[i].dst_dd_alloc.alloc(ctx.pool(i), size_dst_ddf);
+                const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
+                dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf);
             }
         }
 
@@ -3332,20 +3363,20 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
         const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_SYCL_MAX_STREAMS : 0;
         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
 
-        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-            if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) {
+        for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
+            if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
                 continue;
             }
 
-            const bool src1_on_device = i == ctx.device;
-            const bool dst_on_device = i == ctx.device;
-            const int64_t row_diff = dev[i].row_high - dev[i].row_low;
+            const bool src1_on_device = id == ctx.device;
+            const bool dst_on_device = id == ctx.device;
+            const int64_t row_diff = dev[id].row_high - dev[id].row_low;
 
-            ggml_sycl_set_device(i);
-            queue_ptr stream = ctx.stream(i, is);
+            ggml_sycl_set_device(id);
+            queue_ptr stream = ctx.stream(id, is);
 
             // wait for main GPU data if necessary
-            if (split && (i != ctx.device || is != 0)) {
+            if (split && (id != ctx.device || is != 0)) {
                 /*
                 DPCT1009:163: SYCL uses exceptions to report errors and does not
                 use the error codes. The original code was commented out and a
@@ -3362,20 +3393,20 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
             const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
 
             // for split tensors the data begins at i0 == i0_offset_low
-            char * src0_dd_i = dev[i].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
-            float * src1_ddf_i = dev[i].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
-            char * src1_ddq_i = dev[i].src1_ddq + src1_ddq_i_offset;
-            float * dst_dd_i = dev[i].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
+            char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
+            float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
+            char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset;
+            float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
 
             // the main device memory buffer can be on VRAM scratch, with space for all partial results
             // in that case an offset on dst_ddf_i is needed
-            if (i == ctx.device) {
-                dst_dd_i += dev[i].row_low; // offset is 0 if no tensor split
+            if (id == ctx.device) {
+                dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
             }
 
             // copy src0, src1 to device if necessary
             if (src1_is_contiguous) {
-                if (i != ctx.device) {
+                if (id != ctx.device) {
                     if (convert_src1_to_q8_1) {
                         char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
                         SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
@@ -3410,14 +3441,14 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
             }
 
             if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
-                SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[i].row_low, dev[i].row_high, stream));
+                SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
             }
             if (src1->type == GGML_TYPE_F16) {
                 src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10;
             }
             // do the computation
             SYCL_CHECK(CHECK_TRY_ERROR(op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
-                dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream)));
+                dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream)));
             /*
             DPCT1010:93: SYCL uses exceptions to report errors and does not
             use the error codes. The call was replaced with 0. You need to
@@ -3436,7 +3467,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
                 // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
                 float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                 GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
-                dhf_dst_i += src1_col_0*ne0 + dev[i].row_low;
+                dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
 
                 SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
                     dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
@@ -3453,7 +3484,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
             }
 
             // add event for the main device to wait on until other device is done
-            if (split && (i != ctx.device || is != 0)) {
+            if (split && (id != ctx.device || is != 0)) {
                 /*
                 DPCT1024:94: The original code returned the error code that
                 was further consumed by the program logic. This original
@@ -3461,7 +3492,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
                 program logic consuming the error code.
                 */
                 SYCL_CHECK(CHECK_TRY_ERROR(
-                    *src0_extra->events[i][is] =
+                    *src0_extra->events[id][is] =
                         stream->ext_oneapi_submit_barrier()));
             }
         }
@@ -3474,14 +3505,14 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
         is_max = is_max <= GGML_SYCL_MAX_STREAMS ? is_max : GGML_SYCL_MAX_STREAMS;
 
         ggml_sycl_set_device(ctx.device);
-        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-            if (dev[i].row_low == dev[i].row_high) {
+        for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
+            if (dev[id].row_low == dev[id].row_high) {
                 continue;
             }
             for (int64_t is = 0; is < is_max; ++is) {
                 SYCL_CHECK(CHECK_TRY_ERROR(
                     ctx.stream()->ext_oneapi_submit_barrier(
-                        {*src0_extra->events[i][is]})));
+                        {*src0_extra->events[id][is]})));
             }
         }
     }
@@ -4310,17 +4341,17 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
     return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
 }
 
-void ggml_sycl_set_main_device(const int main_device) try {
-    if (dpct::get_current_device_id() == main_device) return;
-    check_allow_gpu_index(main_device);
-    dpct::select_device(main_device);
+void ggml_sycl_set_main_device(const int main_device_id) try {
+    if (dpct::get_current_device_id() == main_device_id) return;
+    check_allow_device_id(main_device_id);
+    dpct::select_device(main_device_id);
 
     if (g_ggml_sycl_debug) {
         dpct::device_info prop;
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-            prop, dpct::dev_mgr::instance().get_device(main_device))));
+            prop, dpct::dev_mgr::instance().get_device(main_device_id))));
         fprintf(stderr, "Using device %d (%s) as main device\n",
-                main_device, prop.get_name());
+                main_device_id, prop.get_name());
    }
 }
 catch (sycl::exception const &exc) {
@@ -4475,9 +4506,9 @@ GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try
     GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
     for(int i=0;i<max_len;i++) id_list[i] = -1;
 
-    for (int i=0;i< ggml_sycl_info().device_count;i++){
-        if (i>=max_len) break;
-        id_list[i] = i;
+    for (int id=0;id< ggml_sycl_info().device_count;id++){
+        if (id>=max_len) break;
+        id_list[id] = id;
     }
     return;
 }
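ggml_sycl_get_gpu_list() fills the caller's array with consecutive device ids and pads the rest with -1, so callers can stop at the first negative entry. A caller-side sketch (array size is illustrative):

    #include <cstdio>
    #include "ggml-sycl.h"

    static void print_gpu_ids(void) {
        int ids[16];
        ggml_sycl_get_gpu_list(ids, 16);
        for (int i = 0; i < 16 && ids[i] >= 0; ++i) {
            printf("usable SYCL device id: %d\n", ids[i]);
        }
    }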
@@ -4501,12 +4532,12 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }
 
-GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
+GGML_API GGML_CALL void ggml_sycl_get_device_description(int device_id, char *description,
                                                          size_t description_size) try {
     GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
     dpct::device_info prop;
     SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-        prop, dpct::dev_mgr::instance().get_device(device))));
+        prop, dpct::dev_mgr::instance().get_device(device_id))));
     snprintf(description, description_size, "%s", prop.get_name());
 }
 catch (sycl::exception const &exc) {
@@ -4515,10 +4546,10 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }
 
-GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
+GGML_CALL void ggml_backend_sycl_get_device_memory(int device_id, size_t *free,
                                                    size_t *total) try {
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
-    ggml_sycl_set_device(device);
+    ggml_sycl_set_device(device_id);
 
     /*
     DPCT1009:218: SYCL uses exceptions to report errors and does not use the
@@ -4531,7 +4562,7 @@ GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
     You may need to adjust the code.
     */
     SYCL_CHECK(CHECK_TRY_ERROR(
-        dpct::dev_mgr::instance().get_device(device).get_memory_info(*free, *total)));
+        dpct::dev_mgr::instance().get_device(device_id).get_memory_info(*free, *total)));
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
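Both query helpers now take the same unified device_id. A small caller-side sketch for the description path:

    #include <cstdio>
    #include "ggml-sycl.h"

    // Print the reported name of device 0 (buffer size is illustrative).
    static void print_device_name(void) {
        char desc[128];
        ggml_sycl_get_device_description(0, desc, sizeof(desc));
        printf("device 0: %s\n", desc);
    }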
@@ -4553,9 +4584,9 @@ struct ggml_backend_sycl_buffer_context {
     queue_ptr stream;
     std::string name;
 
-    ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
-        device(device), dev_ptr(dev_ptr), stream(stream) {
-        check_allow_gpu_index(device);
+    ggml_backend_sycl_buffer_context(int device_id, void * dev_ptr, queue_ptr stream) :
+        device(device_id), dev_ptr(dev_ptr), stream(stream) {
+        check_allow_device_id(device);
         name = (GGML_SYCL_NAME + std::to_string(device));
     }
 
@@ -4831,58 +4862,58 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
     /* .is_host = */ nullptr,
 };
 
-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) {
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
 
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
 
-    if (device>=ggml_sycl_info().device_count or device<0) {
-        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
-            device, ggml_sycl_info().device_count-1);
-        GGML_ASSERT(device<ggml_sycl_info().device_count);
+    if (device_id>=ggml_sycl_info().device_count or device_id<0) {
+        printf("ggml_backend_sycl_buffer_type error: device_id:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+            device_id, ggml_sycl_info().device_count-1);
+        GGML_ASSERT(device_id<ggml_sycl_info().device_count);
     }
     static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
 
     static bool ggml_backend_sycl_buffer_type_initialized = false;
 
     if (!ggml_backend_sycl_buffer_type_initialized) {
-        for (int i = 0; i < ggml_sycl_info().device_count; i++) {
-            auto & device_i = dpct::dev_mgr::instance().get_device(i);
-            queue_ptr stream = &(device_i.default_queue());
-            ggml_backend_sycl_buffer_types[i] = {
+        for (int id = 0; id < ggml_sycl_info().device_count; id++) {
+            auto & device = dpct::dev_mgr::instance().get_device(id);
+            queue_ptr stream = &(device.default_queue());
+            ggml_backend_sycl_buffer_types[id] = {
                 /* .iface = */ ggml_backend_sycl_buffer_type_interface,
-                /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), stream},
+                /* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), stream},
             };
         }
         ggml_backend_sycl_buffer_type_initialized = true;
     }
-    return &ggml_backend_sycl_buffer_types[device];
+    return &ggml_backend_sycl_buffer_types[device_id];
 }
 
 ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
 
-    int device = ctx->device;
-    if (device>=ggml_sycl_info().device_count or device<0) {
-        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
-            device, ggml_sycl_info().device_count-1);
-        GGML_ASSERT(device<ggml_sycl_info().device_count);
+    int device_id = ctx->device;
+    if (device_id>=ggml_sycl_info().device_count or device_id<0) {
+        printf("ggml_backend_sycl_buffer_type error: device_id:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+            device_id, ggml_sycl_info().device_count-1);
+        GGML_ASSERT(device_id<ggml_sycl_info().device_count);
     }
     static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
 
     static bool ggml_backend_sycl_buffer_type_initialized = false;
 
     if (!ggml_backend_sycl_buffer_type_initialized) {
-        for (int i = 0; i < ggml_sycl_info().device_count; i++) {
-            ggml_backend_sycl_buffer_types[i] = {
+        for (int id = 0; id < ggml_sycl_info().device_count; id++) {
+            ggml_backend_sycl_buffer_types[id] = {
                 /* .iface = */ ggml_backend_sycl_buffer_type_interface,
-                /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), ctx->stream(i, 0)},
+                /* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), ctx->stream(id, 0)},
             };
         }
         ggml_backend_sycl_buffer_type_initialized = true;
    }
-    return &ggml_backend_sycl_buffer_types[device];
+    return &ggml_backend_sycl_buffer_types[device_id];
 }
 
 // sycl split buffer type
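Both overloads share the same pattern: a function-local static array of buffer types, filled once on first use. A stripped-down sketch of that lazy singleton-table idiom (all names invented, table size assumed):

    #include <mutex>

    struct buft { int id; };

    // One slot per device, built on first call, then reused forever
    // (assumes device_count <= 16).
    static buft * get_buft(int device_id, int device_count) {
        static std::mutex mutex;
        std::lock_guard<std::mutex> lock(mutex);
        static buft table[16];
        static bool initialized = false;
        if (!initialized) {
            for (int id = 0; id < device_count; id++) {
                table[id] = buft{id};
            }
            initialized = true;
        }
        return &table[device_id];
    }

One consequence of this design, visible above as well: the table is sized on the first call, so any device-list refresh (e.g. via ggml_sycl_set_single_device) has to happen before the first buffer type is requested.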
@@ -4903,9 +4934,9 @@ static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tens
 struct ggml_backend_sycl_split_buffer_context {
     ~ggml_backend_sycl_split_buffer_context() try {
         for (ggml_tensor_extra_gpu * extra : tensor_extras) {
-            for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+            for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
                 for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
-                    if (extra->events[i][is] != nullptr) {
+                    if (extra->events[id][is] != nullptr) {
                         /*
                         DPCT1009:206: SYCL uses exceptions to report errors and
                         does not use the error codes. The original code was
@@ -4913,19 +4944,19 @@ struct ggml_backend_sycl_split_buffer_context {
                         need to rewrite this code.
                         */
                         SYCL_CHECK(CHECK_TRY_ERROR(
-                            dpct::destroy_event(extra->events[i][is])));
+                            dpct::destroy_event(extra->events[id][is])));
                     }
                 }
-                if (extra->data_device[i] != nullptr) {
+                if (extra->data_device[id] != nullptr) {
                     /*
                     DPCT1009:207: SYCL uses exceptions to report errors and does
                     not use the error codes. The original code was commented out
                     and a warning string was inserted. You need to rewrite this
                     code.
                     */
-                    ggml_sycl_set_device(i);
+                    ggml_sycl_set_device(id);
                     SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(
-                        extra->data_device[i], *(streams[i]))));
+                        extra->data_device[id], *(streams[id]))));
                 }
             }
             delete extra;
@@ -4978,9 +5009,9 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
     ctx->tensor_extras.push_back(extra);
     ctx->streams.push_back(&(dpct::get_current_device().default_queue()));
 
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
         int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
 
         int64_t nrows_split = row_high - row_low;
         if (nrows_split == 0) {
@@ -4997,8 +5028,8 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
 
         // FIXME: do not crash if cudaMalloc fails
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
-        ggml_sycl_set_device(i);
-        const queue_ptr stream = ctx->streams[i];
+        ggml_sycl_set_device(id);
+        const queue_ptr stream = ctx->streams[id];
         char * buf;
         /*
         DPCT1009:208: SYCL uses exceptions to report errors and does not use the
@@ -5021,7 +5052,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
                 .wait()));
         }
 
-        extra->data_device[i] = buf;
+        extra->data_device[id] = buf;
 
         for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
             /*
@@ -5030,7 +5061,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
             string was inserted. You need to rewrite this code.
             */
             SYCL_CHECK(
-                CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event()));
+                CHECK_TRY_ERROR(extra->events[id][is] = new sycl::event()));
         }
     }
     tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
@@ -5057,9 +5088,9 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
     const size_t nb1 = tensor->nb[1];
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
 
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
         int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
 
         int64_t nrows_split = row_high - row_low;
         if (nrows_split == 0) {
@@ -5081,11 +5112,11 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
         error codes. The original code was commented out and a warning string
         was inserted. You need to rewrite this code.
         */
-        ggml_sycl_set_device(i);
-        const queue_ptr stream = ctx->streams[i];
+        ggml_sycl_set_device(id);
+        const queue_ptr stream = ctx->streams[id];
         SYCL_CHECK(CHECK_TRY_ERROR(
             (*stream)
-                .memcpy(extra->data_device[i], buf_host, original_size)
+                .memcpy(extra->data_device[id], buf_host, original_size)
                 .wait()));
     }
 }
@@ -5110,9 +5141,9 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
     const size_t nb1 = tensor->nb[1];
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
 
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
         int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
 
         int64_t nrows_split = row_high - row_low;
         if (nrows_split == 0) {
@@ -5134,11 +5165,11 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
         error codes. The original code was commented out and a warning string
         was inserted. You need to rewrite this code.
         */
-        ggml_sycl_set_device(i);
-        const queue_ptr stream = ctx->streams[i];
+        ggml_sycl_set_device(id);
+        const queue_ptr stream = ctx->streams[id];
         SYCL_CHECK(CHECK_TRY_ERROR(
             (*stream)
-                .memcpy(buf_host, extra->data_device[i], original_size)
+                .memcpy(buf_host, extra->data_device[id], original_size)
                 .wait()));
     }
 }
@@ -5193,9 +5224,9 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_
 
     const int64_t ne0 = tensor->ne[0];
 
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+    for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
         int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, i);
+        get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
 
         int64_t nrows_split = row_high - row_low;
         if (nrows_split == 0) {
@@ -5244,12 +5275,12 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const f
         tensor_split_arr = ggml_sycl_info().default_tensor_split;
     } else {
         float split_sum = 0.0f;
-        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-            tensor_split_arr[i] = split_sum;
-            split_sum += tensor_split[i];
+        for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
+            tensor_split_arr[id] = split_sum;
+            split_sum += tensor_split[id];
         }
-        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-            tensor_split_arr[i] /= split_sum;
+        for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
+            tensor_split_arr[id] /= split_sum;
         }
     }
 
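The two loops turn user-supplied per-device weights into normalized prefix sums. A worked example with three devices and tensor_split = {1, 1, 2}:

    // pass 1 (prefix sums):  arr = {0, 1, 2},  split_sum = 4
    // pass 2 (normalize):    arr = {0.0f, 0.25f, 0.5f}
    // device d then owns the row fraction [arr[d], arr[d+1]),
    // with 1.0 as the implicit upper bound for the last device,
    // i.e. 25% / 25% / 50% of the rows, matching the 1:1:2 weights.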
@@ -5613,13 +5644,10 @@ static ggml_guid_t ggml_backend_sycl_guid() {
     return &guid;
 }
 
-GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
-    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
-    ggml_check_sycl();
-
-    check_allow_gpu_index(device);
-
-    ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(device);
+GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device_id) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init, device_id=%d\n", device_id);
+    check_allow_device_id(device_id);
+    ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(ggml_sycl_info(), device_id);
     if (ctx == nullptr) {
         fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
         return nullptr;
@@ -5654,10 +5682,10 @@ extern "C" int ggml_backend_sycl_reg_devices();
 
 int ggml_backend_sycl_reg_devices() {
     assert(ggml_sycl_info().device_count>0);
-    for (int i = 0; i < ggml_sycl_info().device_count; i++) {
+    for (int id = 0; id < ggml_sycl_info().device_count; id++) {
         char name[128];
-        snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, i);
-        ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(i), (void *) (intptr_t) i);
+        snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, id);
+        ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(id), (void *) (intptr_t) id);
     }
     return ggml_sycl_info().device_count;
 }
@@ -49,7 +49,7 @@ static int g_ggml_sycl_debug = 0;
 
 // #define DEBUG_SYCL_MALLOC
 
-static int g_work_group_size = 0;
+static int g_work_group_size = -1;
 // typedef sycl::half ggml_fp16_t;
 
 #define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
@@ -89,6 +89,11 @@ enum ggml_sycl_backend_gpu_mode {
     SYCL_MUL_GPU_MODE
 };
 
+enum ggml_sycl_backend_device_filter {
+    SYCL_DEVICE_FILTER_ALL = 0,
+    SYCL_DEVICE_FILTER_TOP_LEVEL_ZERO
+};
+
 static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
 static void crash() {
@@ -158,18 +163,18 @@ static size_t g_scratch_offset = 0;
 
 int get_current_device_id();
 
-inline dpct::err0 ggml_sycl_set_device(const int device) try {
+inline dpct::err0 ggml_sycl_set_device(const int device_id) try {
 
     int current_device_id;
     SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));
 
-    // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d,
-    // current_device_id=%d\n", device, current_device);
-    if (device == current_device_id) {
+    GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, current_device_id=%d\n", device_id, current_device_id);
+    if (device_id == current_device_id) {
         return 0;
     }
 
-    return CHECK_TRY_ERROR(dpct::select_device(device));
+    return CHECK_TRY_ERROR(dpct::select_device(device_id));
 
 } catch (sycl::exception const& exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
               << ", line:" << __LINE__ << std::endl;
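The early-out keeps repeated calls cheap: switching devices only happens when the requested id differs from the current one. A behavior sketch, assuming device 0 is already current:

    // ggml_sycl_set_device(0);  // logs "...device_id=0, current_device_id=0",
    //                           // returns 0 without touching the device state
    // ggml_sycl_set_device(1);  // actually switches via dpct::select_device(1)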
@@ -177,25 +182,218 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
     std::exit(1);
 }
 
+//////////////////////
+class sycl_device_mgr {
+  public:
+    std::vector<int> device_ids;
+    std::vector<sycl::device> devices;
+    std::vector<int> max_compute_units;
+    std::vector<int> work_group_sizes;
+    sycl::queue *first_queue;
+    std::vector<sycl::queue *> queues;
+    std::vector<sycl::context> ctxs;
+    std::string device_list = "";
+
+    sycl_device_mgr(ggml_sycl_backend_device_filter device_filter) {
+        if (device_filter == SYCL_DEVICE_FILTER_TOP_LEVEL_ZERO) {
+            detect_sycl_gpu_list_with_max_cu();
+            create_context_for_group_gpus();
+        } else {
+            detect_all_sycl_device_list();
+            create_context_queue_for_devices();
+        }
+        get_allow_devices();
+    }
+
+    /*
+    Bind all GPUs in the same host to the same context, for better performance
+    in device-to-device copies in the future.
+    */
+    void create_context_for_group_gpus() {
+        sycl::context ctx = sycl::context(devices);
+        assert(device_ids.size() > 0);
+        first_queue = dpct::get_current_device().create_queue(ctx, devices[0]);
+        sycl::context ctx0 = first_queue->get_context();
+        for (int i = 0; i < device_ids.size(); i++) {
+            ctxs.push_back(ctx0);
+            dpct::select_device(device_ids[i]);
+            queues.push_back(
+                dpct::get_current_device().create_queue(ctx0, devices[i]));
+        }
+    }
+
+    void create_context_queue_for_devices() {
+        for (int i = 0; i < device_ids.size(); i++) {
+            sycl::context ctx = sycl::context(devices[i]);
+            ctxs.push_back(ctx);
+            dpct::select_device(device_ids[i]);
+            queues.push_back(
+                dpct::get_current_device().create_queue(ctx, devices[i]));
+        }
+    }
+
+    void get_allow_devices() {
+        device_list = "";
+        for (size_t i = 0; i < device_ids.size(); ++i) {
+            device_list += std::to_string(device_ids[i]);
+            device_list += ",";
+        }
+        if (device_list.length() > 1) {
+            device_list.pop_back();
+        }
+    }
+
+    bool is_allowed_device(int device_id) {
+        return std::find(device_ids.begin(), device_ids.end(), device_id) != device_ids.end();
+    }
+
+    void detect_all_sycl_device_list() try {
+        int device_count = dpct::dev_mgr::instance().device_count();
+
+        for (int id = 0; id < device_count; id++) {
+            sycl::device device = dpct::dev_mgr::instance().get_device(id);
+            device_ids.push_back(id);
+            devices.push_back(device);
+            dpct::device_info prop;
+            dpct::get_device_info(prop, device);
+            work_group_sizes.push_back(prop.get_max_work_group_size());
+            max_compute_units.push_back(prop.get_max_compute_units());
+        }
+        return;
+    } catch (sycl::exception const &exc) {
+        std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+                  << ", line:" << __LINE__ << std::endl;
+        std::exit(1);
+    }
+
+    /*
+    Use all GPUs with the same top max compute units.
+    */
+    void detect_sycl_gpu_list_with_max_cu() try {
+        int device_count = dpct::dev_mgr::instance().device_count();
+        int local_max_compute_units = 0;
+        for (int id = 0; id < device_count; id++) {
+            sycl::device device = dpct::dev_mgr::instance().get_device(id);
+            if (!device.is_gpu())
+                continue;
+            dpct::device_info prop;
+            dpct::get_device_info(prop, device);
+            if (local_max_compute_units < prop.get_max_compute_units())
+                local_max_compute_units = prop.get_max_compute_units();
+        }
+
+        for (int id = 0; id < device_count; id++) {
+            sycl::device device = dpct::dev_mgr::instance().get_device(id);
+            if (!device.is_gpu())
+                continue;
+            dpct::device_info prop;
+            dpct::get_device_info(prop, device);
+            if (local_max_compute_units == prop.get_max_compute_units() &&
+                is_ext_oneapi_device(device)) {
+                device_ids.push_back(id);
+                devices.push_back(device);
+                work_group_sizes.push_back(prop.get_max_work_group_size());
+                max_compute_units.push_back(prop.get_max_compute_units());
+            }
+        }
+        return;
+    } catch (sycl::exception const &exc) {
+        std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+                  << ", line:" << __LINE__ << std::endl;
+        std::exit(1);
+    }
+
+    int get_device_count() { return (int)device_ids.size(); }
+
+    bool is_ext_oneapi_device(const sycl::device &dev) {
+        sycl::backend dev_backend = dev.get_backend();
+        if (dev_backend == sycl::backend::ext_oneapi_level_zero ||
+            dev_backend == sycl::backend::ext_oneapi_cuda ||
+            dev_backend == sycl::backend::ext_oneapi_hip)
+            return true;
+        return false;
+    }
+};
+
 struct ggml_sycl_device_info {
     int device_count;
 
+    int main_gpu_id = -1;
+    ggml_sycl_backend_gpu_mode use_gpu_mode = SYCL_MUL_GPU_MODE;
+
     struct sycl_device_info {
-        int cc; // compute capability
+        int cc;            // compute capability
         // int nsm; // number of streaming multiprocessors
         // size_t smpb; // max. shared memory per block
-        bool vmm; // virtual memory support
-        size_t total_vram;
+        bool vmm;          // virtual memory support
+        size_t total_vram;
     };
 
     sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
 
     std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
-};
 
-const ggml_sycl_device_info & ggml_sycl_info();
+    sycl_device_mgr *local_sycl_device_mgr = NULL;
+
+    void print_gpu_device_list() {
+        GGML_ASSERT(local_sycl_device_mgr);
+
+        char *hint = NULL;
+        if (use_gpu_mode == SYCL_MUL_GPU_MODE) {
+            hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n";
+            fprintf(stderr, hint, local_sycl_device_mgr->get_device_count(),
+                    local_sycl_device_mgr->device_list.c_str(),
+                    local_sycl_device_mgr->max_compute_units[main_gpu_id]);
+        } else {
+            hint = "use main device [%d] with Max compute units:%d\n";
+            fprintf(stderr, hint, main_gpu_id,
+                    local_sycl_device_mgr->max_compute_units[main_gpu_id]);
+        }
+    }
+
+    int work_group_size(int device_id) {
+        GGML_ASSERT(local_sycl_device_mgr);
+        return local_sycl_device_mgr->work_group_sizes[device_id];
+    }
+
+    void refresh_device(ggml_sycl_backend_gpu_mode gpu_model,
+                        int p_main_gpu_id = 0) {
+        main_gpu_id = p_main_gpu_id;
+        use_gpu_mode = gpu_model;
+        if (local_sycl_device_mgr)
+            delete local_sycl_device_mgr;
+
+        if (use_gpu_mode == SYCL_MUL_GPU_MODE) {
+            local_sycl_device_mgr =
+                new sycl_device_mgr(SYCL_DEVICE_FILTER_TOP_LEVEL_ZERO);
+        } else {
+            GGML_ASSERT(main_gpu_id >= 0);
+            local_sycl_device_mgr = new sycl_device_mgr(SYCL_DEVICE_FILTER_ALL);
+        }
+
+        device_count = local_sycl_device_mgr->get_device_count();
+
+        int64_t total_vram = 0;
+
+        for (int i = 0; i < device_count; ++i) {
+            devices[i].vmm = 0;
+            dpct::device_info prop;
+            SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+                prop, dpct::dev_mgr::instance().get_device(i))));
+
+            default_tensor_split[i] = total_vram;
+            total_vram += prop.get_global_mem_size();
+
+            devices[i].cc =
+                100 * prop.get_major_version() + 10 * prop.get_minor_version();
+        }
+
+        for (int id = 0; id < device_count; ++id) {
+            default_tensor_split[id] /= total_vram;
+        }
+
+        g_work_group_size = work_group_size(main_gpu_id);
+        print_gpu_device_list();
+    }
+};
 
 struct ggml_sycl_pool {
     virtual ~ggml_sycl_pool() = default;
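In multi-GPU mode, detect_sycl_gpu_list_with_max_cu() keeps only GPUs whose compute-unit count equals the global maximum and whose backend is Level Zero, CUDA, or HIP. A worked example for a hypothetical box with four devices:

    // id 0: discrete GPU, 512 CUs, ext_oneapi_level_zero -> kept
    // id 1: discrete GPU, 512 CUs, ext_oneapi_level_zero -> kept
    // id 2: integrated GPU, 32 CUs, ext_oneapi_level_zero -> dropped (not top CU)
    // id 3: CPU device                                    -> dropped (!is_gpu())
    // result: device_ids = {0, 1}, device_list = "0,1", get_device_count() == 2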
@@ -262,16 +460,15 @@ struct ggml_backend_sycl_context {
 
     queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
 
-    explicit ggml_backend_sycl_context(int device) :
+    explicit ggml_backend_sycl_context(struct ggml_sycl_device_info &sycl_device_info, int device) :
         device(device),
         name(GGML_SYCL_NAME + std::to_string(device)) {
+        qptrs[device][0] = sycl_device_info.local_sycl_device_mgr->queues[device];
     }
 
     queue_ptr stream(int device, int stream) {
-        if (qptrs[device][stream] == nullptr) {
-            qptrs[device][stream] = &(dpct::get_current_device().default_queue());
-        }
-        return qptrs[device][stream];
+        assert(qptrs[device][0] != nullptr);
+        return qptrs[device][0];
     }
 
     queue_ptr stream() {
@@ -17619,6 +17619,7 @@ struct llama_context * llama_new_context_with_model(
 #elif defined(GGML_USE_SYCL)
         // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            ggml_sycl_set_single_device(model->main_gpu);
             ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
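The call order here is the point of the commit: the device list is refreshed into single-GPU mode before the backend context (and its queues) is created. A hedged sketch of the same sequence outside llama.cpp (helper name and id invented):

    #include "ggml-sycl.h"

    // Switch the SYCL backend to single-GPU mode first, then create it,
    // exactly as the hunk above does with model->main_gpu.
    ggml_backend_t make_main_gpu_backend(int main_gpu_id) {
        ggml_sycl_set_single_device(main_gpu_id);
        return ggml_backend_sycl_init(main_gpu_id); // nullptr on failure
    }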