refactor device in sycl_device, restore ctx in create_queue
This commit is contained in:
parent
e66117076c
commit
d5380f3af2
9 changed files with 495 additions and 481 deletions
|
@ -48,7 +48,7 @@ void ggml_sycl_get_device_description(int device, char * description, size_t d
|
|||
bool ggml_backend_is_sycl(ggml_backend_t backend);
|
||||
int ggml_backend_sycl_get_device(ggml_backend_t backend);
|
||||
static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer);
|
||||
|
||||
static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer);
|
||||
|
||||
void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
|
||||
const void *ptr_src, size_t size) {
|
||||
|
@ -2279,11 +2279,11 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
|
|||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) {
|
||||
if (min_compute_capability > ggml_sycl_info().devices[id].cc) {
|
||||
min_compute_capability = ggml_sycl_info().devices[id].cc;
|
||||
if (min_compute_capability > ggml_sycl_info().device_infos[id].cc) {
|
||||
min_compute_capability = ggml_sycl_info().device_infos[id].cc;
|
||||
}
|
||||
if (max_compute_capability < ggml_sycl_info().devices[id].cc) {
|
||||
max_compute_capability = ggml_sycl_info().devices[id].cc;
|
||||
if (max_compute_capability < ggml_sycl_info().device_infos[id].cc) {
|
||||
max_compute_capability = ggml_sycl_info().device_infos[id].cc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2680,17 +2680,14 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
|
|||
}
|
||||
|
||||
#ifdef NDEBUG
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
for (auto &id: ggml_sycl_info().ids) {
|
||||
SYCL_CHECK(ggml_sycl_set_device(id));
|
||||
}
|
||||
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
for (auto &id: ggml_sycl_info().ids) {
|
||||
SYCL_CHECK(ggml_sycl_set_device(id));
|
||||
|
||||
for (int i_other = 0; i_other < ggml_sycl_info().device_count; ++i_other) {
|
||||
int id_other = ggml_backend_sycl_get_device_id(i_other);
|
||||
for (auto &id_other: ggml_sycl_info().ids) {
|
||||
if (id == id_other) {
|
||||
continue;
|
||||
}
|
||||
|
@ -2843,7 +2840,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
} else {
|
||||
dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
|
||||
}
|
||||
|
||||
if (convert_src1_to_q8_1) {
|
||||
dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
|
||||
|
||||
|
@ -3165,8 +3161,13 @@ static void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * s
|
|||
|
||||
static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
// log_tensor_with_cnt(ctx, "log/src0", src0, -1);
|
||||
// log_tensor_with_cnt(ctx, "log/src1", src1, -1);
|
||||
// log_tensor_with_cnt(ctx, "log/dst0", dst, -1);
|
||||
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_rms_norm);
|
||||
// log_tensor_with_cnt(ctx, "log/dst1", dst, -1);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
// exit(1);
|
||||
}
|
||||
|
||||
static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
|
@ -3417,12 +3418,12 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
|||
continue;
|
||||
}
|
||||
|
||||
if (min_compute_capability > ggml_sycl_info().devices[id].cc) {
|
||||
min_compute_capability = ggml_sycl_info().devices[id].cc;
|
||||
if (min_compute_capability > ggml_sycl_info().device_infos[id].cc) {
|
||||
min_compute_capability = ggml_sycl_info().device_infos[id].cc;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
min_compute_capability = ggml_sycl_info().devices[ctx.device].cc;
|
||||
min_compute_capability = ggml_sycl_info().device_infos[ctx.device].cc;
|
||||
}
|
||||
|
||||
// check data types and tensor shapes for custom matrix multiplication kernels:
|
||||
|
@ -4332,7 +4333,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
|
|||
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) {
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
|
||||
|
||||
check_allow_device_id(device_id);
|
||||
|
@ -4345,7 +4345,9 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) {
|
|||
for (int i = 0; i < ggml_sycl_info().device_count; i++) {
|
||||
int id = ggml_backend_sycl_get_device_id(i);
|
||||
auto & device = dpct::dev_mgr::instance().get_device(id);
|
||||
queue_ptr stream = &(device.default_queue());
|
||||
// queue_ptr stream = &(device.default_queue());
|
||||
queue_ptr stream = ggml_sycl_info().device_infos[id].qptrs[0];
|
||||
|
||||
ggml_backend_sycl_buffer_types[id] = {
|
||||
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
||||
/* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), stream},
|
||||
|
|
|
@ -20,12 +20,16 @@ void* ggml_sycl_host_malloc(size_t size) try {
|
|||
if (getenv("GGML_SYCL_NO_PINNED") != nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// ggml_sycl_info().device_mgr->first_queue
|
||||
void* ptr = nullptr;
|
||||
// allow to use dpct::get_in_order_queue() for host malloc
|
||||
dpct::err0 err = CHECK_TRY_ERROR(
|
||||
ptr = (void*)sycl::malloc_host(size, dpct::get_in_order_queue()));
|
||||
auto q = dpct::get_in_order_queue();
|
||||
// sycl::queue q = *ggml_sycl_info().device_mgr->qptrs[0][0];
|
||||
|
||||
dpct::err0 err = CHECK_TRY_ERROR(
|
||||
ptr = (void*)sycl::malloc_host(size, q));
|
||||
|
||||
// printf("zjy ggml_sycl_host_malloc ptr=%p queue=%p size=%lu \n", ptr,q, size);
|
||||
if (err != 0) {
|
||||
// clear the error
|
||||
fprintf(
|
||||
|
@ -66,27 +70,6 @@ static inline int get_sycl_env(const char *env_name, int default_val) {
|
|||
return user_number;
|
||||
}
|
||||
|
||||
static inline bool env_existed(const char *env_name) {
|
||||
char *user_device_string = getenv(env_name);
|
||||
return user_device_string!=NULL;
|
||||
}
|
||||
|
||||
static std::vector<int> get_sycl_visible_devices() {
|
||||
static std::vector<int> device_ids;
|
||||
char *devices_env = getenv("GGML_SYCL_VISIBLE_DEVICES");
|
||||
if (devices_env != nullptr) {
|
||||
std::string devices(devices_env);
|
||||
std::replace(devices.begin(), devices.end(), ',', ' ');
|
||||
|
||||
std::stringstream ss(devices);
|
||||
int tmp;
|
||||
while (ss >> tmp) {
|
||||
device_ids.push_back(tmp);
|
||||
}
|
||||
}
|
||||
return device_ids;
|
||||
}
|
||||
|
||||
void print_device_detail_part1(int id, sycl::device &device, std::string device_type) {
|
||||
|
||||
dpct::device_info prop;
|
||||
|
@ -193,8 +176,7 @@ static ggml_sycl_device_info ggml_sycl_init() try {
|
|||
initialized = true;
|
||||
}
|
||||
|
||||
static ggml_sycl_device_info info = {};
|
||||
info.refresh_device();
|
||||
static ggml_sycl_device_info info;
|
||||
|
||||
if (info.device_count == 0) {
|
||||
fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": no available device found\n",
|
||||
|
@ -215,288 +197,4 @@ ggml_sycl_device_info &ggml_sycl_info() {
|
|||
return info;
|
||||
}
|
||||
|
||||
//--sycl_device_mgr--
|
||||
|
||||
sycl_device_mgr::sycl_device_mgr(
|
||||
ggml_sycl_backend_device_filter device_filter) {
|
||||
switch (device_filter) {
|
||||
case SYCL_DEVICES_TOP_LEVEL_ZERO:
|
||||
detect_sycl_gpu_list_with_max_cu();
|
||||
create_context_for_group_gpus();
|
||||
break;
|
||||
case SYCL_ALL_DEVICES:
|
||||
detect_all_sycl_device_list();
|
||||
create_context_for_devices();
|
||||
break;
|
||||
case SYCL_VISIBLE_DEVICES:
|
||||
detect_sycl_visible_device_list();
|
||||
create_context_for_devices();
|
||||
break;
|
||||
default:
|
||||
std::cerr << "sycl_device_mgr: Invalid device_filter " << device_filter
|
||||
<< std::endl;
|
||||
}
|
||||
init_allow_devices();
|
||||
}
|
||||
|
||||
/*
|
||||
Bind all gpus in same host with same context, for better performance in
|
||||
device-to-device copy in the future.
|
||||
*/
|
||||
void sycl_device_mgr::create_context_for_group_gpus() {
|
||||
sycl::context ctx = sycl::context(devices);
|
||||
assert(device_ids.size() > 0);
|
||||
first_queue = _create_queue_ptr(devices[0]);
|
||||
sycl::context ctx0 = first_queue->get_context();
|
||||
for (int i = 0; i < device_ids.size(); i++) {
|
||||
ctxs.push_back(ctx0);
|
||||
}
|
||||
}
|
||||
|
||||
sycl::queue *sycl_device_mgr::_create_queue_ptr(sycl::device device) {
|
||||
auto q = dpct::get_current_device().create_queue(device);
|
||||
return q;
|
||||
// _queues.push_back(q);
|
||||
// return & _queues.back();
|
||||
}
|
||||
|
||||
sycl::queue *sycl_device_mgr::create_queue_for_device(sycl::device &device) {
|
||||
dpct::select_device(dpct::dev_mgr::instance().get_device_id(device));
|
||||
auto qptr = _create_queue_ptr(device);
|
||||
return qptr;
|
||||
}
|
||||
|
||||
sycl::queue *sycl_device_mgr::create_queue_for_device_id(int device_id) {
|
||||
int i = get_device_index(device_id);
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(device_id);
|
||||
return create_queue_for_device(device);
|
||||
}
|
||||
|
||||
int sycl_device_mgr::get_device_index(int device_id) {
|
||||
for (int i = 0; i < device_ids.size(); i++) {
|
||||
if (device_ids[i] == device_id)
|
||||
return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
void sycl_device_mgr::create_context_for_devices() {
|
||||
for (int i = 0; i < device_ids.size(); i++) {
|
||||
sycl::context ctx = sycl::context(devices[i]);
|
||||
ctxs.push_back(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
void sycl_device_mgr::init_allow_devices() {
|
||||
device_list = "";
|
||||
for (size_t i = 0; i < device_ids.size(); ++i) {
|
||||
device_list += std::to_string(device_ids[i]);
|
||||
device_list += ",";
|
||||
}
|
||||
if (device_list.length() > 1) {
|
||||
device_list.pop_back();
|
||||
}
|
||||
}
|
||||
|
||||
bool sycl_device_mgr::is_allowed_device(int device_id) {
|
||||
return std::find(device_ids.begin(), device_ids.end(), device_id) !=
|
||||
device_ids.end();
|
||||
}
|
||||
|
||||
void sycl_device_mgr::detect_all_sycl_device_list() try {
|
||||
int device_count = dpct::dev_mgr::instance().device_count();
|
||||
|
||||
for (int id = 0; id < device_count; id++) {
|
||||
add_device_info(id);
|
||||
}
|
||||
return;
|
||||
} catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
void sycl_device_mgr::detect_sycl_visible_device_list() try {
|
||||
std::vector<int> sycl_devices = get_sycl_visible_devices();
|
||||
int device_count = dpct::dev_mgr::instance().device_count();
|
||||
|
||||
for (int i = 0; i < sycl_devices.size(); i++) {
|
||||
int id = sycl_devices[i];
|
||||
if (id >= device_count) {
|
||||
std::cerr << __func__ << ": invalid device_id:" << id
|
||||
<< " from GGML_SYCL_VISIBLE_DEVICES="
|
||||
<< getenv("GGML_SYCL_VISIBLE_DEVICES")
|
||||
<< ", available IDs: ";
|
||||
if (device_count > 1) {
|
||||
std::cerr << "[0, " << device_count - 1 << "]";
|
||||
} else if (device_count == 1) {
|
||||
std::cerr << "[0]";
|
||||
} else {
|
||||
std::cerr << "[]";
|
||||
}
|
||||
std::cerr << std::endl;
|
||||
}
|
||||
add_device_info(id);
|
||||
}
|
||||
return;
|
||||
} catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
/*
|
||||
Use all GPUs with same top max compute units
|
||||
*/
|
||||
void sycl_device_mgr::detect_sycl_gpu_list_with_max_cu() try {
|
||||
int device_count = dpct::dev_mgr::instance().device_count();
|
||||
int local_max_compute_units = 0;
|
||||
for (int id = 0; id < device_count; id++) {
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
||||
if (!device.is_gpu())
|
||||
continue;
|
||||
dpct::device_info prop;
|
||||
dpct::get_device_info(prop, device);
|
||||
if (local_max_compute_units < prop.get_max_compute_units())
|
||||
local_max_compute_units = prop.get_max_compute_units();
|
||||
}
|
||||
|
||||
for (int id = 0; id < device_count; id++) {
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
||||
if (!device.is_gpu())
|
||||
continue;
|
||||
dpct::device_info prop;
|
||||
dpct::get_device_info(prop, device);
|
||||
if (local_max_compute_units == prop.get_max_compute_units() &&
|
||||
is_ext_oneapi_device(device)) {
|
||||
add_device_info(id);
|
||||
}
|
||||
}
|
||||
return;
|
||||
} catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
int sycl_device_mgr::get_device_count() { return (int)device_ids.size(); }
|
||||
|
||||
bool sycl_device_mgr::is_ext_oneapi_device(const sycl::device &dev) {
|
||||
sycl::backend dev_backend = dev.get_backend();
|
||||
if (dev_backend == sycl::backend::ext_oneapi_level_zero ||
|
||||
dev_backend == sycl::backend::ext_oneapi_cuda ||
|
||||
dev_backend == sycl::backend::ext_oneapi_hip)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
void sycl_device_mgr::add_device_info(int id) {
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
||||
device_ids.push_back(id);
|
||||
devices.push_back(device);
|
||||
dpct::device_info prop;
|
||||
dpct::get_device_info(prop, device);
|
||||
work_group_sizes.push_back(prop.get_max_work_group_size());
|
||||
max_compute_units.push_back(prop.get_max_compute_units());
|
||||
hw_familys.push_back(get_device_family(&device));
|
||||
}
|
||||
|
||||
//--sycl_device_mgr--
|
||||
|
||||
//--ggml_sycl_device_info--
|
||||
void ggml_sycl_device_info::print_gpu_device_list() {
|
||||
GGML_ASSERT(device_mgr);
|
||||
|
||||
char *hint = NULL;
|
||||
if (oneapi_device_selector_existed && sycl_visible_devices_existed) {
|
||||
hint = "detect %d SYCL devices:[%s] by ONEAPI_DEVICE_SELECTOR=%s and "
|
||||
"GGML_SYCL_VISIBLE_DEVICES=%s\n";
|
||||
fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(),
|
||||
getenv("ONEAPI_DEVICE_SELECTOR"),
|
||||
getenv("GGML_SYCL_VISIBLE_DEVICES"));
|
||||
} else if (oneapi_device_selector_existed) {
|
||||
hint = "detect %d SYCL devices:[%s] by ONEAPI_DEVICE_SELECTOR=%s\n";
|
||||
fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(),
|
||||
getenv("ONEAPI_DEVICE_SELECTOR"));
|
||||
} else if (sycl_visible_devices_existed) {
|
||||
hint = "detect %d SYCL devices:[%s] by GGML_SYCL_VISIBLE_DEVICES=%s\n";
|
||||
fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(),
|
||||
getenv("GGML_SYCL_VISIBLE_DEVICES"));
|
||||
} else {
|
||||
hint = "detect %d SYCL level-zero GPUs:[%s] with top Max compute "
|
||||
"units:%d, to use any SYCL devices, set/export "
|
||||
"GGML_SYCL_VISIBLE_DEVICES or ONEAPI_DEVICE_SELECTOR\n";
|
||||
fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(),
|
||||
device_mgr->max_compute_units[0]);
|
||||
}
|
||||
}
|
||||
|
||||
int ggml_sycl_device_info::work_group_size(int device_id) {
|
||||
GGML_ASSERT(device_mgr);
|
||||
return device_mgr->work_group_sizes[device_id];
|
||||
}
|
||||
|
||||
void ggml_sycl_device_info::refresh_device() {
|
||||
oneapi_device_selector_existed = env_existed("ONEAPI_DEVICE_SELECTOR");
|
||||
sycl_visible_devices_existed = env_existed("GGML_SYCL_VISIBLE_DEVICES");
|
||||
if (!device_mgr)
|
||||
delete device_mgr;
|
||||
|
||||
if (sycl_visible_devices_existed) {
|
||||
device_mgr = new sycl_device_mgr(SYCL_VISIBLE_DEVICES);
|
||||
} else if (oneapi_device_selector_existed) {
|
||||
device_mgr = new sycl_device_mgr(SYCL_ALL_DEVICES);
|
||||
} else {
|
||||
device_mgr = new sycl_device_mgr(SYCL_DEVICES_TOP_LEVEL_ZERO);
|
||||
}
|
||||
|
||||
device_count = device_mgr->get_device_count();
|
||||
|
||||
int64_t total_vram = 0;
|
||||
|
||||
for (int i = 0; i < device_count; ++i) {
|
||||
int id = get_device_id(i);
|
||||
devices[id].vmm = 0;
|
||||
dpct::device_info prop;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
||||
prop, dpct::dev_mgr::instance().get_device(id))));
|
||||
|
||||
default_tensor_split[i] =
|
||||
total_vram; // continue data, so use device index
|
||||
total_vram += prop.get_global_mem_size();
|
||||
|
||||
devices[id].cc =
|
||||
100 * prop.get_major_version() + 10 * prop.get_minor_version();
|
||||
}
|
||||
|
||||
for (int i = 0; i < device_count; ++i) {
|
||||
default_tensor_split[i] /=
|
||||
total_vram; // continue data, so use device index
|
||||
}
|
||||
|
||||
print_gpu_device_list();
|
||||
}
|
||||
|
||||
bool ggml_sycl_device_info::is_allowed_device(int device_id) {
|
||||
return device_mgr->is_allowed_device(device_id);
|
||||
}
|
||||
|
||||
const char *ggml_sycl_device_info::devices_list() {
|
||||
return device_mgr->device_list.c_str();
|
||||
}
|
||||
|
||||
int ggml_sycl_device_info::get_device_id(int device_index) {
|
||||
if (device_index < device_mgr->device_ids.size()) {
|
||||
return device_mgr->device_ids.at(device_index);
|
||||
} else {
|
||||
std::cerr << __func__ << ":SYCL device:" << device_index
|
||||
<< " is out of range:[" << devices_list() << "]" << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
int ggml_sycl_device_info::hw_family(int device_id) {
|
||||
return device_mgr->hw_familys[device_id];
|
||||
}
|
||||
|
||||
//--ggml_sycl_device_info--
|
||||
|
|
|
@ -21,12 +21,13 @@
|
|||
#include "ggml-sycl.h"
|
||||
#include "presets.hpp"
|
||||
#include "sycl_hw.hpp"
|
||||
#include "sycl_device.hpp"
|
||||
|
||||
#define GGML_COMMON_DECL_SYCL
|
||||
#define GGML_COMMON_IMPL_SYCL
|
||||
#include "ggml-common.h"
|
||||
|
||||
void* ggml_sycl_host_malloc(size_t size);
|
||||
|
||||
void ggml_sycl_host_free(void* ptr);
|
||||
|
||||
static int g_ggml_sycl_debug = 0;
|
||||
|
@ -86,12 +87,6 @@ enum ggml_sycl_backend_gpu_mode {
|
|||
SYCL_MUL_GPU_MODE
|
||||
};
|
||||
|
||||
enum ggml_sycl_backend_device_filter {
|
||||
SYCL_ALL_DEVICES = 0,
|
||||
SYCL_DEVICES_TOP_LEVEL_ZERO,
|
||||
SYCL_VISIBLE_DEVICES
|
||||
};
|
||||
|
||||
static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
||||
|
||||
static void crash() {
|
||||
|
@ -169,10 +164,10 @@ inline dpct::err0 ggml_sycl_set_device(const int device_id) try {
|
|||
int current_device_id;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));
|
||||
|
||||
GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, current_device_id=%d\n", device_id, current_device_id);
|
||||
if (device_id == current_device_id) {
|
||||
return 0;
|
||||
}
|
||||
GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, current_device_id=%d\n", device_id, current_device_id);
|
||||
|
||||
return CHECK_TRY_ERROR(dpct::select_device(device_id));
|
||||
|
||||
|
@ -183,67 +178,6 @@ inline dpct::err0 ggml_sycl_set_device(const int device_id) try {
|
|||
std::exit(1);
|
||||
}
|
||||
|
||||
|
||||
class sycl_device_mgr {
|
||||
public:
|
||||
std::vector<int> device_ids;
|
||||
std::vector<sycl::device> devices;
|
||||
std::vector<int> max_compute_units;
|
||||
std::vector<int> work_group_sizes;
|
||||
std::vector<int> hw_familys;
|
||||
|
||||
sycl::queue *first_queue;
|
||||
std::vector<sycl::queue> _queues;
|
||||
std::vector<sycl::context> ctxs;
|
||||
std::string device_list = "";
|
||||
|
||||
sycl_device_mgr(ggml_sycl_backend_device_filter device_filter);
|
||||
|
||||
sycl::queue *_create_queue_ptr(sycl::device device); //internal API to hide dpct API.
|
||||
void create_context_for_group_gpus();
|
||||
sycl::queue *create_queue_for_device(sycl::device &device);
|
||||
sycl::queue *create_queue_for_device_id(int device_id);
|
||||
int get_device_index(int device_id);
|
||||
void create_context_for_devices();
|
||||
void init_allow_devices();
|
||||
bool is_allowed_device(int device_id);
|
||||
void detect_all_sycl_device_list();
|
||||
void detect_sycl_visible_device_list();
|
||||
void detect_sycl_gpu_list_with_max_cu();
|
||||
int get_device_count();
|
||||
bool is_ext_oneapi_device(const sycl::device &dev);
|
||||
void add_device_info(int id);
|
||||
};
|
||||
|
||||
|
||||
struct ggml_sycl_device_info {
|
||||
int device_count;
|
||||
bool oneapi_device_selector_existed = false;
|
||||
bool sycl_visible_devices_existed = false;
|
||||
|
||||
struct sycl_device_info {
|
||||
int cc; // compute capability
|
||||
// int nsm; // number of streaming multiprocessors
|
||||
// size_t smpb; // max. shared memory per block
|
||||
bool vmm; // virtual memory support
|
||||
size_t total_vram;
|
||||
};
|
||||
|
||||
sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
|
||||
|
||||
std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
|
||||
|
||||
sycl_device_mgr *device_mgr = NULL;
|
||||
|
||||
void print_gpu_device_list();
|
||||
int work_group_size(int device_id);
|
||||
void refresh_device();
|
||||
bool is_allowed_device(int device_id);
|
||||
const char* devices_list();
|
||||
int get_device_id(int device_index);
|
||||
int hw_family(int device_id);
|
||||
};
|
||||
|
||||
struct ggml_sycl_pool {
|
||||
virtual ~ggml_sycl_pool() = default;
|
||||
|
||||
|
@ -309,17 +243,17 @@ struct ggml_backend_sycl_context {
|
|||
|
||||
queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
|
||||
|
||||
explicit ggml_backend_sycl_context(struct ggml_sycl_device_info &sycl_device_info, int device_id) :
|
||||
device(device_id),
|
||||
explicit ggml_backend_sycl_context(struct ggml_sycl_device_info &sycl_device_info, int id) :
|
||||
device(id),
|
||||
name(GGML_SYCL_NAME + std::to_string(device)) {
|
||||
for (int i=0;i<GGML_SYCL_MAX_STREAMS; i++){
|
||||
qptrs[device_id][i] = sycl_device_info.device_mgr->create_queue_for_device_id(device_id);
|
||||
qptrs[id][i] = sycl_device_info.device_infos[id].qptrs[i];
|
||||
}
|
||||
}
|
||||
|
||||
queue_ptr stream(int device, int stream) {
|
||||
assert(qptrs[device][stream] != nullptr);
|
||||
return qptrs[device][stream];
|
||||
queue_ptr stream(int id, int stream) {
|
||||
assert(qptrs[id][stream] != nullptr);
|
||||
return qptrs[id][stream];
|
||||
}
|
||||
|
||||
queue_ptr stream() {
|
||||
|
@ -349,10 +283,10 @@ static inline void exit_with_stack_print() {
|
|||
|
||||
|
||||
static inline int get_sycl_env(const char *env_name, int default_val);
|
||||
static inline bool env_existed(const char *env_name);
|
||||
|
||||
void* ggml_sycl_host_malloc(size_t size);
|
||||
void ggml_sycl_host_free(void* ptr);
|
||||
static std::vector<int> get_sycl_visible_devices();
|
||||
|
||||
void ggml_backend_sycl_print_sycl_devices();
|
||||
static ggml_sycl_device_info ggml_sycl_init();
|
||||
ggml_sycl_device_info &ggml_sycl_info();
|
||||
|
|
|
@ -592,16 +592,18 @@ namespace dpct
|
|||
class device_ext : public sycl::device {
|
||||
typedef std::mutex mutex_type;
|
||||
|
||||
public:
|
||||
device_ext() : sycl::device() {}
|
||||
~device_ext() {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
clear_queues();
|
||||
}
|
||||
device_ext(const sycl::device &base) : sycl::device(base) {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
init_queues();
|
||||
}
|
||||
public:
|
||||
device_ext() : sycl::device(), _ctx(*this) {}
|
||||
~device_ext()
|
||||
{
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
clear_queues();
|
||||
}
|
||||
device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this)
|
||||
{
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
init_queues();
|
||||
}
|
||||
|
||||
int is_native_atomic_supported() { return 0; }
|
||||
int get_major_version() const { return dpct::get_major_version(*this); }
|
||||
|
@ -711,10 +713,10 @@ namespace dpct
|
|||
return create_in_order_queue(enable_exception_handler);
|
||||
}
|
||||
|
||||
sycl::queue *create_queue(sycl::device device,
|
||||
bool enable_exception_handler = false) {
|
||||
return create_in_order_queue(device, enable_exception_handler);
|
||||
}
|
||||
sycl::queue *create_queue(sycl::context context, sycl::device device,
|
||||
bool enable_exception_handler = false) {
|
||||
return create_in_order_queue(context, device, enable_exception_handler);
|
||||
}
|
||||
|
||||
sycl::queue *create_in_order_queue(bool enable_exception_handler = false) {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
|
@ -722,12 +724,12 @@ namespace dpct
|
|||
sycl::property::queue::in_order());
|
||||
}
|
||||
|
||||
sycl::queue *create_in_order_queue(sycl::device device,
|
||||
sycl::queue *create_in_order_queue(sycl::context context, sycl::device device,
|
||||
bool enable_exception_handler = false) {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
return create_queue_impl(device, enable_exception_handler,
|
||||
sycl::property::queue::in_order());
|
||||
}
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
return create_queue_impl(context, device, enable_exception_handler,
|
||||
sycl::property::queue::in_order());
|
||||
}
|
||||
|
||||
sycl::queue *create_out_of_order_queue(
|
||||
bool enable_exception_handler = false) {
|
||||
|
@ -735,24 +737,28 @@ namespace dpct
|
|||
return create_queue_impl(enable_exception_handler);
|
||||
}
|
||||
|
||||
void destroy_queue(sycl::queue *&queue) {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
_queues.erase(std::remove_if(_queues.begin(), _queues.end(),
|
||||
[=](const std::shared_ptr<sycl::queue> &q) -> bool
|
||||
{
|
||||
return q.get() == queue;
|
||||
}),
|
||||
_queues.end());
|
||||
queue = nullptr;
|
||||
}
|
||||
void set_saved_queue(sycl::queue *q) {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
_saved_queue = q;
|
||||
}
|
||||
sycl::queue *get_saved_queue() const {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
return _saved_queue;
|
||||
}
|
||||
void destroy_queue(sycl::queue *&queue)
|
||||
{
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
_queues.erase(std::remove_if(_queues.begin(), _queues.end(),
|
||||
[=](const std::shared_ptr<sycl::queue> &q) -> bool
|
||||
{
|
||||
return q.get() == queue;
|
||||
}),
|
||||
_queues.end());
|
||||
queue = nullptr;
|
||||
}
|
||||
void set_saved_queue(sycl::queue *q)
|
||||
{
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
_saved_queue = q;
|
||||
}
|
||||
sycl::queue *get_saved_queue() const
|
||||
{
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
return _saved_queue;
|
||||
}
|
||||
sycl::context get_context() const { return _ctx; }
|
||||
|
||||
private:
|
||||
void clear_queues() {
|
||||
|
@ -767,18 +773,19 @@ namespace dpct
|
|||
_saved_queue = &default_queue();
|
||||
}
|
||||
|
||||
/// Caller should acquire resource \p m_mutex before calling this
|
||||
/// function.
|
||||
template <class... Properties>
|
||||
sycl::queue *create_queue_impl(bool enable_exception_handler,
|
||||
Properties... properties) {
|
||||
sycl::async_handler eh = {};
|
||||
if (enable_exception_handler) {
|
||||
eh = exception_handler;
|
||||
}
|
||||
_queues.push_back(std::make_shared<sycl::queue>(
|
||||
*this, eh,
|
||||
sycl::property_list(
|
||||
/// Caller should acquire resource \p m_mutex before calling this function.
|
||||
template <class... Properties>
|
||||
sycl::queue *create_queue_impl(bool enable_exception_handler,
|
||||
Properties... properties)
|
||||
{
|
||||
sycl::async_handler eh = {};
|
||||
if (enable_exception_handler)
|
||||
{
|
||||
eh = exception_handler;
|
||||
}
|
||||
_queues.push_back(std::make_shared<sycl::queue>(
|
||||
_ctx, *this, eh,
|
||||
sycl::property_list(
|
||||
#ifdef DPCT_PROFILING_ENABLED
|
||||
sycl::property::queue::enable_profiling(),
|
||||
#endif
|
||||
|
@ -787,21 +794,21 @@ namespace dpct
|
|||
return _queues.back().get();
|
||||
}
|
||||
|
||||
template <class... Properties>
|
||||
sycl::queue *create_queue_impl(sycl::device device,
|
||||
template <class... Properties>
|
||||
sycl::queue *create_queue_impl(sycl::context context, sycl::device device,
|
||||
bool enable_exception_handler,
|
||||
Properties... properties) {
|
||||
sycl::async_handler eh = {};
|
||||
if (enable_exception_handler) {
|
||||
eh = exception_handler;
|
||||
}
|
||||
_queues.push_back(std::make_shared<sycl::queue>(
|
||||
device, eh,
|
||||
sycl::property_list(
|
||||
#ifdef DPCT_PROFILING_ENABLED
|
||||
sycl::property::queue::enable_profiling(),
|
||||
#endif
|
||||
properties...)));
|
||||
sycl::async_handler eh = {};
|
||||
if (enable_exception_handler) {
|
||||
eh = exception_handler;
|
||||
}
|
||||
_queues.push_back(std::make_shared<sycl::queue>(
|
||||
context, device, eh,
|
||||
sycl::property_list(
|
||||
#ifdef DPCT_PROFILING_ENABLED
|
||||
sycl::property::queue::enable_profiling(),
|
||||
#endif
|
||||
properties...)));
|
||||
|
||||
return _queues.back().get();
|
||||
}
|
||||
|
@ -811,6 +818,7 @@ namespace dpct
|
|||
}
|
||||
sycl::queue *_q_in_order, *_q_out_of_order;
|
||||
sycl::queue *_saved_queue;
|
||||
sycl::context _ctx;
|
||||
std::vector<std::shared_ptr<sycl::queue>> _queues;
|
||||
mutable mutex_type m_mutex;
|
||||
};
|
||||
|
|
|
@ -1779,7 +1779,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
|||
int id;
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(id = get_current_device_id()));
|
||||
const int compute_capability = ggml_sycl_info().devices[id].cc;
|
||||
const int compute_capability = ggml_sycl_info().device_infos[id].cc;
|
||||
|
||||
int mmq_x, mmq_y, nwarps;
|
||||
if (compute_capability >= VER_GEN13) {
|
||||
|
@ -1894,7 +1894,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
|||
int id;
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(id = get_current_device_id()));
|
||||
const int compute_capability = ggml_sycl_info().devices[id].cc;
|
||||
const int compute_capability = ggml_sycl_info().device_infos[id].cc;
|
||||
|
||||
int mmq_x, mmq_y, nwarps;
|
||||
if (compute_capability >= VER_GEN13) {
|
||||
|
@ -2009,7 +2009,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
|||
int id;
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(id = get_current_device_id()));
|
||||
const int compute_capability = ggml_sycl_info().devices[id].cc;
|
||||
const int compute_capability = ggml_sycl_info().device_infos[id].cc;
|
||||
|
||||
int mmq_x, mmq_y, nwarps;
|
||||
if (compute_capability >= VER_GEN13) {
|
||||
|
@ -2124,7 +2124,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
|||
int id;
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(id = get_current_device_id()));
|
||||
const int compute_capability = ggml_sycl_info().devices[id].cc;
|
||||
const int compute_capability = ggml_sycl_info().device_infos[id].cc;
|
||||
|
||||
int mmq_x, mmq_y, nwarps;
|
||||
if (compute_capability >= VER_GEN13) {
|
||||
|
@ -2239,7 +2239,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
|||
int id;
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(id = get_current_device_id()));
|
||||
const int compute_capability = ggml_sycl_info().devices[id].cc;
|
||||
const int compute_capability = ggml_sycl_info().device_infos[id].cc;
|
||||
|
||||
int mmq_x, mmq_y, nwarps;
|
||||
if (compute_capability >= VER_GEN13) {
|
||||
|
@ -2354,7 +2354,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
|||
int id;
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(id = get_current_device_id()));
|
||||
const int compute_capability = ggml_sycl_info().devices[id].cc;
|
||||
const int compute_capability = ggml_sycl_info().device_infos[id].cc;
|
||||
|
||||
int mmq_x, mmq_y, nwarps;
|
||||
if (compute_capability >= VER_GEN13) {
|
||||
|
@ -2477,7 +2477,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|||
int id;
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(id = get_current_device_id()));
|
||||
const int compute_capability = ggml_sycl_info().devices[id].cc;
|
||||
const int compute_capability = ggml_sycl_info().device_infos[id].cc;
|
||||
|
||||
int mmq_x, mmq_y, nwarps;
|
||||
if (compute_capability >= VER_GEN13) {
|
||||
|
@ -2605,7 +2605,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
|||
int id;
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(id = get_current_device_id()));
|
||||
const int compute_capability = ggml_sycl_info().devices[id].cc;
|
||||
const int compute_capability = ggml_sycl_info().device_infos[id].cc;
|
||||
|
||||
int mmq_x, mmq_y, nwarps;
|
||||
if (compute_capability >= VER_GEN13) {
|
||||
|
@ -2726,7 +2726,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
|||
int id;
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(id = get_current_device_id()));
|
||||
const int compute_capability = ggml_sycl_info().devices[id].cc;
|
||||
const int compute_capability = ggml_sycl_info().device_infos[id].cc;
|
||||
|
||||
int mmq_x, mmq_y, nwarps;
|
||||
if (compute_capability >= VER_GEN13) {
|
||||
|
@ -2847,7 +2847,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
|||
int id;
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(id = get_current_device_id()));
|
||||
const int compute_capability = ggml_sycl_info().devices[id].cc;
|
||||
const int compute_capability = ggml_sycl_info().device_infos[id].cc;
|
||||
|
||||
int mmq_x, mmq_y, nwarps;
|
||||
if (compute_capability >= VER_GEN13) {
|
||||
|
|
286
ggml/src/ggml-sycl/sycl_device.cpp
Normal file
286
ggml/src/ggml-sycl/sycl_device.cpp
Normal file
|
@ -0,0 +1,286 @@
|
|||
#include "sycl_device.hpp"
|
||||
#include "sycl_hw.hpp"
|
||||
|
||||
|
||||
void ggml_sycl_device_info::init(
|
||||
ggml_sycl_backend_device_filter device_filter) {
|
||||
switch (device_filter) {
|
||||
case SYCL_DEVICES_TOP_LEVEL_ZERO:
|
||||
detect_sycl_gpu_list_with_max_cu();
|
||||
create_context_for_devices();
|
||||
break;
|
||||
case SYCL_ALL_DEVICES:
|
||||
detect_all_sycl_device_list();
|
||||
create_context_for_devices();
|
||||
break;
|
||||
case SYCL_VISIBLE_DEVICES:
|
||||
detect_sycl_visible_device_list();
|
||||
create_context_for_devices();
|
||||
break;
|
||||
default:
|
||||
std::cerr << "ggml_sycl_device_info: Invalid device_filter " << device_filter
|
||||
<< std::endl;
|
||||
}
|
||||
init_allow_devices();
|
||||
device_count = ids.size();
|
||||
}
|
||||
|
||||
/*
|
||||
Bind all devices in same host with same context, for better performance in
|
||||
device-to-device copy in the future.
|
||||
*/
|
||||
void ggml_sycl_device_info::create_context_for_devices() {
|
||||
assert(devices.size() > 0);
|
||||
sycl::context ctx = sycl::context(devices);
|
||||
first_queue = dpct::get_current_device().create_queue(ctx, devices[0]);
|
||||
co_ctx = first_queue->get_context();
|
||||
}
|
||||
|
||||
sycl::queue *ggml_sycl_device_info::_create_queue_ptr(sycl::device device) {
|
||||
auto q = dpct::get_current_device().create_queue(co_ctx, device);
|
||||
return q;
|
||||
}
|
||||
|
||||
sycl::queue *ggml_sycl_device_info::create_queue_for_device(sycl::device &device) {
|
||||
dpct::select_device(dpct::dev_mgr::instance().get_device_id(device));
|
||||
auto qptr = _create_queue_ptr(device);
|
||||
return qptr;
|
||||
}
|
||||
|
||||
sycl::queue *ggml_sycl_device_info::create_queue_for_device_id(int id) {
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
||||
return create_queue_for_device(device);
|
||||
}
|
||||
|
||||
int ggml_sycl_device_info::get_device_index(int id) {
|
||||
for (int i = 0; i < ids.size(); i++) {
|
||||
if (ids[i] == id)
|
||||
return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
void ggml_sycl_device_info::init_allow_devices() {
|
||||
device_list = "";
|
||||
for (auto & id: ids) {
|
||||
device_list += std::to_string(id);
|
||||
device_list += ",";
|
||||
}
|
||||
if (device_list.length() > 1) {
|
||||
device_list.pop_back();
|
||||
}
|
||||
}
|
||||
|
||||
bool ggml_sycl_device_info::is_allowed_device(int id) {
|
||||
return std::find(ids.begin(), ids.end(), id) != ids.end();
|
||||
}
|
||||
|
||||
void ggml_sycl_device_info::detect_all_sycl_device_list() try {
|
||||
int all_device_count = dpct::dev_mgr::instance().device_count();
|
||||
|
||||
for (int id = 0; id < all_device_count; id++) {
|
||||
add_device_info(id);
|
||||
}
|
||||
return;
|
||||
} catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
std::vector<int> ggml_sycl_device_info::get_sycl_visible_devices() {
|
||||
static std::vector<int> device_ids;
|
||||
char *devices_env = getenv("GGML_SYCL_VISIBLE_DEVICES");
|
||||
if (devices_env != nullptr) {
|
||||
std::string devices(devices_env);
|
||||
std::replace(devices.begin(), devices.end(), ',', ' ');
|
||||
|
||||
std::stringstream ss(devices);
|
||||
int tmp;
|
||||
while (ss >> tmp) {
|
||||
device_ids.push_back(tmp);
|
||||
}
|
||||
}
|
||||
return device_ids;
|
||||
}
|
||||
|
||||
void ggml_sycl_device_info::detect_sycl_visible_device_list() try {
|
||||
std::vector<int> sycl_devices = get_sycl_visible_devices();
|
||||
int all_device_count = dpct::dev_mgr::instance().device_count();
|
||||
|
||||
for (auto & id: sycl_devices) {
|
||||
if (id >= all_device_count) {
|
||||
std::cerr << __func__ << ": invalid device_id:" << id
|
||||
<< " from GGML_SYCL_VISIBLE_DEVICES="
|
||||
<< getenv("GGML_SYCL_VISIBLE_DEVICES")
|
||||
<< ", available IDs: ";
|
||||
if (all_device_count > 1) {
|
||||
std::cerr << "[0, " << all_device_count - 1 << "]";
|
||||
} else if (all_device_count == 1) {
|
||||
std::cerr << "[0]";
|
||||
} else {
|
||||
std::cerr << "[]";
|
||||
}
|
||||
std::cerr << std::endl;
|
||||
}
|
||||
add_device_info(id);
|
||||
}
|
||||
return;
|
||||
} catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
/*
|
||||
Use all GPUs with same top max compute units
|
||||
*/
|
||||
void ggml_sycl_device_info::detect_sycl_gpu_list_with_max_cu() try {
|
||||
int all_device_count = dpct::dev_mgr::instance().device_count();
|
||||
int local_max_compute_units = 0;
|
||||
for (int id = 0; id < all_device_count; id++) {
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
||||
if (!device.is_gpu())
|
||||
continue;
|
||||
dpct::device_info prop;
|
||||
dpct::get_device_info(prop, device);
|
||||
if (local_max_compute_units < prop.get_max_compute_units())
|
||||
local_max_compute_units = prop.get_max_compute_units();
|
||||
}
|
||||
|
||||
for (int id = 0; id < all_device_count; id++) {
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
||||
if (!device.is_gpu())
|
||||
continue;
|
||||
dpct::device_info prop;
|
||||
dpct::get_device_info(prop, device);
|
||||
if (local_max_compute_units == prop.get_max_compute_units() &&
|
||||
is_ext_oneapi_device(device)) {
|
||||
add_device_info(id);
|
||||
}
|
||||
}
|
||||
return;
|
||||
} catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
int ggml_sycl_device_info::get_device_count() { return device_count; }
|
||||
|
||||
bool ggml_sycl_device_info::is_ext_oneapi_device(const sycl::device &dev) {
|
||||
sycl::backend dev_backend = dev.get_backend();
|
||||
if (dev_backend == sycl::backend::ext_oneapi_level_zero ||
|
||||
dev_backend == sycl::backend::ext_oneapi_cuda ||
|
||||
dev_backend == sycl::backend::ext_oneapi_hip)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
void ggml_sycl_device_info::add_device_info(int id) {
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
||||
dpct::device_info prop;
|
||||
dpct::get_device_info(prop, device);
|
||||
|
||||
ids.push_back(id);
|
||||
devices.push_back(device);
|
||||
|
||||
device_infos[id].id = id;
|
||||
device_infos[id].device = device;
|
||||
device_infos[id].max_work_group_sizes = prop.get_max_work_group_size();
|
||||
device_infos[id].max_compute_units = prop.get_max_compute_units();
|
||||
device_infos[id].hw_family = get_device_family(&device);
|
||||
for (int i=0; i<GGML_SYCL_MAX_STREAMS;i++) {
|
||||
device_infos[id].qptrs[i] = create_queue_for_device_id(id);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_sycl_device_info::print_gpu_device_list() {
|
||||
char *hint = NULL;
|
||||
if (oneapi_device_selector_existed && sycl_visible_devices_existed) {
|
||||
hint = "detect %d SYCL devices:[%s] by ONEAPI_DEVICE_SELECTOR=%s and "
|
||||
"GGML_SYCL_VISIBLE_DEVICES=%s\n";
|
||||
fprintf(stderr, hint, get_device_count(), devices_list(),
|
||||
getenv("ONEAPI_DEVICE_SELECTOR"),
|
||||
getenv("GGML_SYCL_VISIBLE_DEVICES"));
|
||||
} else if (oneapi_device_selector_existed) {
|
||||
hint = "detect %d SYCL devices:[%s] by ONEAPI_DEVICE_SELECTOR=%s\n";
|
||||
fprintf(stderr, hint, get_device_count(), devices_list(),
|
||||
getenv("ONEAPI_DEVICE_SELECTOR"));
|
||||
} else if (sycl_visible_devices_existed) {
|
||||
hint = "detect %d SYCL devices:[%s] by GGML_SYCL_VISIBLE_DEVICES=%s\n";
|
||||
fprintf(stderr, hint, get_device_count(), devices_list(),
|
||||
getenv("GGML_SYCL_VISIBLE_DEVICES"));
|
||||
} else {
|
||||
hint = "detect %d SYCL level-zero GPUs:[%s] with top Max compute "
|
||||
"units:%d, to use any SYCL devices, set/export "
|
||||
"GGML_SYCL_VISIBLE_DEVICES or ONEAPI_DEVICE_SELECTOR\n";
|
||||
fprintf(stderr, hint, get_device_count(), devices_list(),
|
||||
device_infos[0].max_compute_units);
|
||||
}
|
||||
}
|
||||
|
||||
int ggml_sycl_device_info::work_group_size(int id) {
|
||||
GGML_ASSERT(is_allowed_device(id));
|
||||
return device_infos[id].max_work_group_sizes;
|
||||
}
|
||||
|
||||
ggml_sycl_device_info::ggml_sycl_device_info() {
|
||||
oneapi_device_selector_existed = env_existed("ONEAPI_DEVICE_SELECTOR");
|
||||
sycl_visible_devices_existed = env_existed("GGML_SYCL_VISIBLE_DEVICES");
|
||||
|
||||
if (sycl_visible_devices_existed) {
|
||||
init(SYCL_VISIBLE_DEVICES);
|
||||
} else if (oneapi_device_selector_existed) {
|
||||
init(SYCL_ALL_DEVICES);
|
||||
} else {
|
||||
init(SYCL_DEVICES_TOP_LEVEL_ZERO);
|
||||
}
|
||||
|
||||
int64_t total_vram = 0;
|
||||
|
||||
for (int i = 0; i < device_count; ++i) {
|
||||
int id = get_device_id(i);
|
||||
device_infos[id].vmm = 0;
|
||||
dpct::device_info prop;
|
||||
dpct::get_device_info(
|
||||
prop, dpct::dev_mgr::instance().get_device(id));
|
||||
|
||||
// continue data, so use device index
|
||||
default_tensor_split[i] = total_vram;
|
||||
total_vram += prop.get_global_mem_size();
|
||||
|
||||
device_infos[id].cc =
|
||||
100 * prop.get_major_version() + 10 * prop.get_minor_version();
|
||||
}
|
||||
|
||||
// continue data, so use device index
|
||||
for (int i = 0; i < device_count; ++i) {
|
||||
default_tensor_split[i] /= total_vram;
|
||||
}
|
||||
|
||||
print_gpu_device_list();
|
||||
}
|
||||
|
||||
const char *ggml_sycl_device_info::devices_list() {
|
||||
return device_list.c_str();
|
||||
}
|
||||
|
||||
int ggml_sycl_device_info::get_device_id(int device_index) {
|
||||
if (device_index < device_count) {
|
||||
return ids.at(device_index);
|
||||
} else {
|
||||
std::cerr << __func__ << ":SYCL device:" << device_index
|
||||
<< " is out of range:[" << devices_list() << "]" << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
int ggml_sycl_device_info::hw_family(int id) {
|
||||
return device_infos[id].hw_family;
|
||||
}
|
||||
|
||||
static inline bool env_existed(const char *env_name) {
|
||||
char *user_device_string = getenv(env_name);
|
||||
return user_device_string!=NULL;
|
||||
}
|
83
ggml/src/ggml-sycl/sycl_device.hpp
Normal file
83
ggml/src/ggml-sycl/sycl_device.hpp
Normal file
|
@ -0,0 +1,83 @@
|
|||
#ifndef SYCL_DEVICE_HPP
|
||||
#define SYCL_DEVICE_HPP
|
||||
|
||||
#include <algorithm>
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include "dpct/helper.hpp"
|
||||
|
||||
#include "ggml-sycl.h"
|
||||
#include "presets.hpp"
|
||||
// #include "common.hpp"
|
||||
|
||||
enum ggml_sycl_backend_device_filter {
|
||||
SYCL_ALL_DEVICES = 0,
|
||||
SYCL_DEVICES_TOP_LEVEL_ZERO,
|
||||
SYCL_VISIBLE_DEVICES
|
||||
};
|
||||
|
||||
struct sycl_device_info {
|
||||
int cc; // compute capability
|
||||
// int nsm; // number of streaming multiprocessors
|
||||
// size_t smpb; // max. shared memory per block
|
||||
bool vmm; // virtual memory support
|
||||
size_t total_vram;
|
||||
|
||||
int id;
|
||||
sycl::device device;
|
||||
int max_compute_units;
|
||||
int max_work_group_sizes;
|
||||
int hw_family;
|
||||
sycl::context ctx;
|
||||
sycl::queue * qptrs[GGML_SYCL_MAX_STREAMS] = { nullptr };
|
||||
};
|
||||
|
||||
struct ggml_sycl_device_info {
|
||||
int device_count;
|
||||
bool oneapi_device_selector_existed = false;
|
||||
bool sycl_visible_devices_existed = false;
|
||||
std::vector<int> ids;
|
||||
std::vector<sycl::device> devices;
|
||||
sycl::queue *first_queue;
|
||||
std::string device_list;
|
||||
sycl::context co_ctx;
|
||||
|
||||
sycl_device_info device_infos[GGML_SYCL_MAX_DEVICES];
|
||||
std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
|
||||
|
||||
ggml_sycl_device_info();
|
||||
void init(ggml_sycl_backend_device_filter device_filter);
|
||||
|
||||
void print_gpu_device_list();
|
||||
int work_group_size(int device_id);
|
||||
bool is_allowed_device(int device_id);
|
||||
const char* devices_list();
|
||||
int get_device_id(int device_index);
|
||||
int hw_family(int device_id);
|
||||
|
||||
sycl::queue *_create_queue_ptr(sycl::device device); //internal API to hide dpct API.
|
||||
void create_context_for_group_gpus();
|
||||
sycl::queue *create_queue_for_device(sycl::device &device);
|
||||
sycl::queue *create_queue_for_device_id(int device_id);
|
||||
int get_device_index(int device_id);
|
||||
void create_context_for_devices();
|
||||
void init_allow_devices();
|
||||
void detect_all_sycl_device_list();
|
||||
void detect_sycl_visible_device_list();
|
||||
void detect_sycl_gpu_list_with_max_cu();
|
||||
int get_device_count();
|
||||
bool is_ext_oneapi_device(const sycl::device &dev);
|
||||
void add_device_info(int id);
|
||||
std::vector<sycl::device> get_devices();
|
||||
std::vector<int> get_sycl_visible_devices();
|
||||
|
||||
sycl::context &get_co_ctx() { return co_ctx; }
|
||||
|
||||
};
|
||||
|
||||
static inline bool env_existed(const char *env_name);
|
||||
|
||||
#endif // SYCL_DEVICE_HPP
|
|
@ -8,8 +8,14 @@ SYCL_HW_FAMILY get_device_family(sycl::device *device_ptr) {
|
|||
auto id = device_ptr->get_info<sycl::ext::intel::info::device::device_id>();
|
||||
auto id_prefix = id & 0xff00;
|
||||
|
||||
const std::vector<int> Xe_ARC = {0x5600, 0x4f00};
|
||||
const std::vector<int> Xe_Iris_IDs = {0x4900, 0xa700};
|
||||
const std::vector<int> UHD_IDs = {0x4600};
|
||||
|
||||
if (is_in_vector(Xe_Iris_IDs, id_prefix) or is_in_vector(UHD_IDs, id_prefix)) {
|
||||
return SYCL_HW_FAMILY_INTEL_IGPU;
|
||||
} else if (is_in_vector(Xe_ARC, id_prefix)) {
|
||||
return SYCL_HW_FAMILY_INTEL_ARC;
|
||||
} else {
|
||||
std::cerr << "No support PCI_ID: " << std::hex << id << std::endl;
|
||||
return SYCL_HW_FAMILY_UNKNOWN;
|
||||
|
|
|
@ -8,13 +8,10 @@
|
|||
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
// const int Xe_ARC[] = {0x5600, 0x4f};
|
||||
const std::vector<int> Xe_Iris_IDs = {0x4900, 0xa700};
|
||||
const std::vector<int> UHD_IDs = {0x4600};
|
||||
|
||||
enum SYCL_HW_FAMILY {
|
||||
SYCL_HW_FAMILY_UNKNOWN = -1,
|
||||
SYCL_HW_FAMILY_INTEL_IGPU = 0
|
||||
SYCL_HW_FAMILY_INTEL_IGPU = 0,
|
||||
SYCL_HW_FAMILY_INTEL_ARC = 1
|
||||
};
|
||||
|
||||
bool is_in_vector(std::vector<int> &vec, int item);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue