support set main gpu

This commit is contained in:
parent d5380f3af2
commit 1947c1200e

7 changed files with 112 additions and 31 deletions
@@ -38,6 +38,8 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int index);
 GGML_API GGML_CALL void ggml_sycl_set_single_device(int main_gpu_id);
 
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
+
 // SYCL doesn't support registering host memory, keep here for reference
 // GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
 // GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
@@ -38,6 +38,7 @@
 
 #include "ggml-sycl/backend.hpp"
 #include "ggml-sycl/presets.hpp"
+#include "ggml-sycl/sycl_device.hpp"
 
 
 void ggml_sycl_free_data(struct ggml_tensor * tensor);
@@ -5150,6 +5151,13 @@ GGML_CALL int ggml_backend_sycl_get_device_count() {
     return ggml_sycl_info().device_count;
 }
 
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
+    fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
+    ggml_sycl_info(main_gpu_id);
+}
+
+
 GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {
     ggml_backend_t sycl_backend = ggml_backend_sycl_init((int) (intptr_t) user_data);
     return sycl_backend;
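A minimal caller-side sketch of how the new entry point is meant to be used (hypothetical usage, not part of this commit; it only relies on functions already declared in ggml-sycl.h / ggml-backend.h):

#include "ggml-backend.h"
#include "ggml-sycl.h"

int main() {
    const int main_gpu = 0;                              // assumed: SYCL device id to pin

    // Must run before anything else queries the SYCL device table, because the
    // table is cached on first use (see ggml_sycl_info in the hunks below).
    ggml_backend_sycl_set_single_device_mode(main_gpu);

    ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
    // ... allocate buffers / run graphs on `backend` ...
    ggml_backend_free(backend);
    return 0;
}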
@@ -137,7 +137,7 @@ void ggml_backend_sycl_print_sycl_devices() {
     }
 }
 
-static ggml_sycl_device_info ggml_sycl_init() try {
+static ggml_sycl_device_info ggml_sycl_init(int main_gpu_id) try {
     static bool initialized = false;
 
     if (!initialized) {
@@ -176,7 +176,7 @@ static ggml_sycl_device_info ggml_sycl_init() try {
         initialized = true;
     }
 
-    static ggml_sycl_device_info info;
+    static ggml_sycl_device_info info(main_gpu_id);
 
     if (info.device_count == 0) {
         fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": no available device found\n",
@@ -192,8 +192,8 @@ static ggml_sycl_device_info ggml_sycl_init() try {
     std::exit(1);
 }
 
-ggml_sycl_device_info &ggml_sycl_info() {
-    static ggml_sycl_device_info info = ggml_sycl_init();
+ggml_sycl_device_info &ggml_sycl_info(int main_gpu_id) {
+    static ggml_sycl_device_info info = ggml_sycl_init(main_gpu_id);
     return info;
 }
 
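Worth noting: ggml_sycl_info() caches the device table in a function-local static, so only the first call selects the mode; the new default argument of -1 (see the header change below) keeps existing call sites on the multi-device path. A small sketch of the ordering this implies for callers (assumed usage, not part of the commit):

// First touch wins: pin the backend to device 0 before any other SYCL entry
// point constructs the cached ggml_sycl_device_info.
ggml_backend_sycl_set_single_device_mode(0);   // internally calls ggml_sycl_info(0)

// Every later call returns the same cached info, regardless of its argument.
int n = ggml_backend_sycl_get_device_count();  // expected to report only the pinned device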
@@ -81,12 +81,6 @@ static int g_ggml_sycl_debug = 0;
 
 typedef sycl::queue *queue_ptr;
 
-enum ggml_sycl_backend_gpu_mode {
-    SYCL_UNSET_GPU_MODE = -1,
-    SYCL_SINGLE_GPU_MODE = 0,
-    SYCL_MUL_GPU_MODE
-};
-
 static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
 static void crash() {
@@ -288,8 +282,8 @@ void* ggml_sycl_host_malloc(size_t size);
 void ggml_sycl_host_free(void* ptr);
 
 void ggml_backend_sycl_print_sycl_devices();
-static ggml_sycl_device_info ggml_sycl_init();
-ggml_sycl_device_info &ggml_sycl_info();
+static ggml_sycl_device_info ggml_sycl_init(int main_gpu_id);
+ggml_sycl_device_info &ggml_sycl_info(int main_gpu_id = -1);
 
 // common device functions
 
@@ -7,22 +7,54 @@ void ggml_sycl_device_info::init(
     switch (device_filter) {
         case SYCL_DEVICES_TOP_LEVEL_ZERO:
             detect_sycl_gpu_list_with_max_cu();
-            create_context_for_devices();
             break;
         case SYCL_ALL_DEVICES:
             detect_all_sycl_device_list();
-            create_context_for_devices();
             break;
        case SYCL_VISIBLE_DEVICES:
             detect_sycl_visible_device_list();
-            create_context_for_devices();
             break;
         default:
             std::cerr << "ggml_sycl_device_info: Invalid device_filter " << device_filter
                       << std::endl;
     }
-    init_allow_devices();
+    init_devices_dynamic_info();
+    m_device_filter = device_filter;
+}
+
+void ggml_sycl_device_info::clear_device_infos() {
+    ids.clear();
+    devices.clear();
+
+    for (int id=0;id<GGML_SYCL_MAX_DEVICES;id++) {
+        device_infos[id].id = -1;
+        device_infos[id].max_work_group_sizes = 0;
+        device_infos[id].max_compute_units = 0;
+        device_infos[id].hw_family = -1;
+        // for (int i=0; i<GGML_SYCL_MAX_STREAMS;i++) {
+        //     free(device_infos[id].qptrs[i]);
+        // }
+    }
+
+    device_count = 0;
+    device_list = "";
+}
+
+void ggml_sycl_device_info::init_single_mode(int main_gpu_id) {
+    GGML_ASSERT(main_gpu_id<dpct::dev_mgr::instance().device_count());
+
+    clear_device_infos();
+    add_device_info(main_gpu_id);
+    init_devices_dynamic_info();
+    device_mode = SYCL_SINGLE_GPU_MODE;
+}
+
+void ggml_sycl_device_info::init_devices_dynamic_info() {
+    create_context_for_devices();
+    set_allow_devices();
     device_count = ids.size();
+    create_queues_for_devices();
+    update_mem();
 }
 
 /*
@@ -60,7 +92,7 @@ int ggml_sycl_device_info::get_device_index(int id) {
     return -1;
 }
 
-void ggml_sycl_device_info::init_allow_devices() {
+void ggml_sycl_device_info::set_allow_devices() {
     device_list = "";
     for (auto & id: ids) {
         device_list += std::to_string(id);
@@ -190,11 +222,22 @@ void ggml_sycl_device_info::add_device_info(int id) {
     device_infos[id].max_work_group_sizes = prop.get_max_work_group_size();
     device_infos[id].max_compute_units = prop.get_max_compute_units();
     device_infos[id].hw_family = get_device_family(&device);
+}
+
+void ggml_sycl_device_info::create_queues(int id) {
     for (int i=0; i<GGML_SYCL_MAX_STREAMS;i++) {
         device_infos[id].qptrs[i] = create_queue_for_device_id(id);
     }
 }
 
+void ggml_sycl_device_info::create_queues_for_devices() {
+    for (auto &id: ids) {
+        for (int i=0; i<GGML_SYCL_MAX_STREAMS;i++) {
+            device_infos[id].qptrs[i] = create_queue_for_device_id(id);
+        }
+    }
+}
+
 void ggml_sycl_device_info::print_gpu_device_list() {
     char *hint = NULL;
     if (oneapi_device_selector_existed && sycl_visible_devices_existed) {
@@ -225,16 +268,12 @@ int ggml_sycl_device_info::work_group_size(int id) {
     return device_infos[id].max_work_group_sizes;
 }
 
-ggml_sycl_device_info::ggml_sycl_device_info() {
-    oneapi_device_selector_existed = env_existed("ONEAPI_DEVICE_SELECTOR");
-    sycl_visible_devices_existed = env_existed("GGML_SYCL_VISIBLE_DEVICES");
+void ggml_sycl_device_info::update_mem() {
 
-    if (sycl_visible_devices_existed) {
-        init(SYCL_VISIBLE_DEVICES);
-    } else if (oneapi_device_selector_existed) {
-        init(SYCL_ALL_DEVICES);
-    } else {
-        init(SYCL_DEVICES_TOP_LEVEL_ZERO);
+    for (int i = 0; i < GGML_SYCL_MAX_DEVICES; ++i) {
+        device_infos[i].vmm = 0;
+        default_tensor_split[i] = 0;
+        device_infos[i].cc =0;
     }
 
     int64_t total_vram = 0;
@@ -258,6 +297,23 @@ ggml_sycl_device_info::ggml_sycl_device_info() {
     for (int i = 0; i < device_count; ++i) {
         default_tensor_split[i] /= total_vram;
     }
+}
+
+ggml_sycl_device_info::ggml_sycl_device_info(int main_gpu_id) {
+    oneapi_device_selector_existed = env_existed("ONEAPI_DEVICE_SELECTOR");
+    sycl_visible_devices_existed = env_existed("GGML_SYCL_VISIBLE_DEVICES");
+
+    if (main_gpu_id == -1) {
+        if (sycl_visible_devices_existed) {
+            init(SYCL_VISIBLE_DEVICES);
+        } else if (oneapi_device_selector_existed) {
+            init(SYCL_ALL_DEVICES);
+        } else {
+            init(SYCL_DEVICES_TOP_LEVEL_ZERO);
+        }
+    } else {
+        init_single_mode(main_gpu_id);
+    }
 
     print_gpu_device_list();
 }
@@ -272,6 +328,8 @@ int ggml_sycl_device_info::get_device_id(int device_index) {
     } else {
         std::cerr << __func__ << ":SYCL device:" << device_index
                   << " is out of range:[" << devices_list() << "]" << std::endl;
+        int* ptr = NULL;
+        *ptr = 0;
         std::exit(1);
     }
 }
@@ -19,6 +19,13 @@ enum ggml_sycl_backend_device_filter {
     SYCL_VISIBLE_DEVICES
 };
 
+enum ggml_sycl_backend_gpu_mode {
+    SYCL_UNSET_GPU_MODE = -1,
+    SYCL_SINGLE_GPU_MODE = 0,
+    SYCL_MUL_GPU_MODE
+};
+
+
 struct sycl_device_info {
     int cc; // compute capability
     // int nsm; // number of streaming multiprocessors
@@ -36,6 +43,7 @@ struct sycl_device_info {
 };
 
 struct ggml_sycl_device_info {
+    int device_mode = SYCL_MUL_GPU_MODE;
     int device_count;
     bool oneapi_device_selector_existed = false;
     bool sycl_visible_devices_existed = false;
@@ -44,13 +52,17 @@ struct ggml_sycl_device_info {
     sycl::queue *first_queue;
     std::string device_list;
     sycl::context co_ctx;
+    int m_device_filter;
 
     sycl_device_info device_infos[GGML_SYCL_MAX_DEVICES];
     std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
 
-    ggml_sycl_device_info();
-    void init(ggml_sycl_backend_device_filter device_filter);
+    ggml_sycl_device_info(int main_gpu_id);//single device mode
+
+    void init(ggml_sycl_backend_device_filter device_filter);
+    void init_single_mode(int main_gpu_id);
+
+    void clear_device_infos();
     void print_gpu_device_list();
     int work_group_size(int device_id);
     bool is_allowed_device(int device_id);
@@ -64,15 +76,19 @@ struct ggml_sycl_device_info {
     sycl::queue *create_queue_for_device_id(int device_id);
     int get_device_index(int device_id);
     void create_context_for_devices();
-    void init_allow_devices();
+    void set_allow_devices();
     void detect_all_sycl_device_list();
     void detect_sycl_visible_device_list();
     void detect_sycl_gpu_list_with_max_cu();
     int get_device_count();
     bool is_ext_oneapi_device(const sycl::device &dev);
     void add_device_info(int id);
+    void create_queues(int id);
+    void create_queues_for_devices();
     std::vector<sycl::device> get_devices();
     std::vector<int> get_sycl_visible_devices();
+    void update_mem();
+    void init_devices_dynamic_info();
 
     sycl::context &get_co_ctx() { return co_ctx; }
@@ -2831,8 +2831,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
 #elif defined(GGML_USE_VULKAN)
     buft = ggml_backend_vk_buffer_type(gpu);
 #elif defined(GGML_USE_SYCL)
-    int gpu_id = ggml_backend_sycl_get_device_id(gpu);
-    buft = ggml_backend_sycl_buffer_type(gpu_id);
+    buft = ggml_backend_sycl_buffer_type(gpu);
 #elif defined(GGML_USE_KOMPUTE)
     buft = ggml_backend_kompute_buffer_type(gpu);
     if (buft == nullptr) {
@@ -5931,6 +5930,10 @@ static bool llm_load_tensors(
             model.buft_output = llama_default_buffer_type_cpu(true);
         }
     } else {
+
+#if defined(GGML_USE_SYCL)
+        ggml_backend_sycl_set_single_device_mode(main_gpu);
+#endif
         ggml_backend_buffer_type_t split_buft;
         if (split_mode == LLAMA_SPLIT_MODE_ROW) {
             split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
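At the llama.cpp level the effect is that, whenever split_mode is not LLAMA_SPLIT_MODE_LAYER, the SYCL backend is pinned to params.main_gpu before buffer types are chosen. A hedged sketch of the user-visible configuration (standard llama.h API; the model path and device id are placeholders):

#include "llama.h"

int main() {
    llama_model_params params = llama_model_default_params();
    params.split_mode   = LLAMA_SPLIT_MODE_NONE;  // not LAYER -> takes the branch patched above
    params.main_gpu     = 1;                      // assumed: pin everything to SYCL device 1
    params.n_gpu_layers = 99;                     // offload all layers to that device

    llama_model * model = llama_load_model_from_file("model.gguf", params);
    // ... create a context and run inference ...
    llama_free_model(model);
    return 0;
}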