fix consistency issues with the usage of main_gpu

slaren 2024-10-02 20:40:26 +02:00
parent b5516aab64
commit dc475c3b20


@@ -3418,22 +3418,25 @@ struct llama_lora_adapter {
     }
 };
 
-static size_t llama_get_device_count(const llama_model & model) {
-    size_t count = 1;
-    count = model.devices.size();
-#if defined(GGML_USE_SYCL)
-    count = ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    count = ggml_backend_vk_get_device_count();
-#elif defined(GGML_USE_CANN)
-    return ggml_backend_cann_get_device_count();
-#endif
+static int llama_get_device_count(const llama_model & model) {
+    int count = (int) model.devices.size();
+
 #if defined(GGML_USE_RPC)
-    count += model.rpc_servers.size();
+    count += (int) model.rpc_servers.size();
 #endif
 
+#if defined(GGML_USE_METAL)
+    count += 1;
+#elif defined(GGML_USE_SYCL)
+    count += ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    count += ggml_backend_vk_get_device_count();
+#elif defined(GGML_USE_CANN)
+    count += ggml_backend_cann_get_device_count();
+#endif
+
     return count;
 
     GGML_UNUSED(model);
 }
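Note: the substantive change in this hunk is that llama_get_device_count() now accumulates every device group instead of letting the compile-time backend branch overwrite the registry count (or, for CANN, return early past the RPC servers). A minimal sketch of the additive scheme, with the per-backend calls replaced by plain integers (hypothetical values, not part of the commit):

    #include <cstdio>

    // The arguments stand in for model.devices.size(), model.rpc_servers.size(),
    // and whatever the compiled-in backend reports (Metal contributes a fixed 1).
    static int device_count(int n_registry, int n_rpc, int n_backend) {
        int count = n_registry;
        count += n_rpc;     // only when built with GGML_USE_RPC
        count += n_backend; // at most one backend group is compiled in
        return count;
    }

    int main() {
        // e.g. 2 registry devices + 1 RPC server + 2 SYCL devices = 5; the old
        // code would have reported just the SYCL count here.
        printf("%d\n", device_count(2, 1, 2));
    }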
@@ -3482,12 +3485,13 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
         const char * endpoint = model.rpc_servers[device].c_str();
         return ggml_backend_rpc_buffer_type(endpoint);
     }
-    device = device - rpc_count;
+    device -= rpc_count;
 #endif
 
     if (device < (int)model.devices.size()) {
-        buft = ggml_backend_dev_buffer_type(model.devices[device]);
+        return ggml_backend_dev_buffer_type(model.devices[device]);
     }
+    device -= (int)model.devices.size();
 
 #if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
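Note: llama_default_buffer_type_offload() resolves a flat device index by peeling off one group at a time: RPC endpoints first, then registry devices, then the compiled-in backend. The new early return and the added subtraction keep the index aligned with that order. A standalone restatement of the pattern (hypothetical helper, illustration only):

    #include <string>

    // Peel-off resolution of a flat device index, in the same order as above:
    // RPC -> registry devices -> compile-time backend.
    static std::string resolve_device(int device, int n_rpc, int n_registry) {
        if (device < n_rpc) {
            return "rpc[" + std::to_string(device) + "]";
        }
        device -= n_rpc;
        if (device < n_registry) {
            return "registry[" + std::to_string(device) + "]";
        }
        device -= n_registry; // whatever remains indexes the compiled-in backend
        return "backend[" + std::to_string(device) + "]";
    }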
@@ -6965,6 +6969,13 @@ static bool llm_load_tensors(
         void * progress_callback_user_data) {
     auto & hparams = model.hparams;
 
+    // check if the value of main_gpu is valid
+    if (llama_get_device_count(model) > 0 &&
+        split_mode != LLAMA_SPLIT_MODE_LAYER &&
+        (main_gpu < 0 || main_gpu >= llama_get_device_count(model))) {
+        throw std::runtime_error(format("invalid value for main_gpu: %d (available devices: %d)", main_gpu, llama_get_device_count(model)));
+    }
+
     model.split_mode   = split_mode;
     model.main_gpu     = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
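Note: the new check rejects an out-of-range main_gpu up front, but only when it will actually be used: with LLAMA_SPLIT_MODE_LAYER every device gets its own backend and main_gpu is irrelevant, and with a device count of 0 (CPU-only build) there is nothing to index. A standalone restatement of the predicate (hypothetical helper, not in the commit):

    // main_gpu must index a device only when a single "main" device will be
    // chosen, i.e. for any split mode other than LLAMA_SPLIT_MODE_LAYER.
    static bool main_gpu_is_valid(int main_gpu, int n_devices, bool split_by_layer) {
        if (n_devices == 0 || split_by_layer) {
            return true; // main_gpu is never dereferenced in these cases
        }
        return main_gpu >= 0 && main_gpu < n_devices;
    }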
@@ -19291,30 +19302,20 @@ struct llama_context * llama_new_context_with_model(
     if (!hparams.vocab_only) {
         // initialize backends
-#if defined(GGML_USE_RPC)
-        if (model->n_gpu_layers > 0) {
-            for (const auto & endpoint : model->rpc_servers) {
-                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-#endif
-        if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-            // with split_mode LLAMA_SPLIT_MODE_NONE, only the main GPU backend is used
-            ggml_backend_dev_t main_dev = model->devices[model->main_gpu];
-            ggml_backend_t backend = ggml_backend_dev_init(main_dev, nullptr);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(main_dev));
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
+        int main_gpu = model->main_gpu;
+
+        // with registry
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            if (main_gpu >= 0 && main_gpu < (int)model->devices.size()) {
+                ggml_backend_dev_t main_dev = model->devices[main_gpu];
+                ggml_backend_t backend = ggml_backend_dev_init(main_dev, nullptr);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(main_dev));
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
         } else {
             // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
             for (auto * dev : model->devices) {
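Note: two behavioural changes hide in this hunk: the single-backend path now also covers LLAMA_SPLIT_MODE_ROW, and it is skipped (rather than indexing model->devices out of bounds) when main_gpu does not fall inside the registry device range, leaving the index to be resolved by the RPC and backend-specific paths that follow. A condensed restatement of the dispatch (sketch only; the inlined values assume LLAMA_SPLIT_MODE_NONE = 0, LAYER = 1, ROW = 2 as declared in llama.h):

    static const char * describe_backend_init(int split_mode, int main_gpu, int n_registry) {
        if (split_mode == 0 /* NONE */ || split_mode == 2 /* ROW */) {
            if (main_gpu >= 0 && main_gpu < n_registry) {
                return "init only model->devices[main_gpu]";
            }
            return "skip here; main_gpu points past the registry devices";
        }
        return "LAYER: init one backend per registry device";
    }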
@@ -19327,6 +19328,26 @@ struct llama_context * llama_new_context_with_model(
                 ctx->backends.push_back(backend);
             }
         }
+        if (main_gpu >= (int)model->devices.size()) {
+            main_gpu -= (int)model->devices.size();
+        }
+
+#if defined(GGML_USE_RPC)
+        if (model->n_gpu_layers > 0) {
+            for (const auto & endpoint : model->rpc_servers) {
+                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        }
+        if (main_gpu >= (int)model->rpc_servers.size()) {
+            main_gpu -= (int)model->rpc_servers.size();
+        }
+#endif
 
 #if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
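Note: after the registry and RPC backends are set up, main_gpu is translated from the flat, user-facing index into an index local to the compile-time backend group by discounting each group it has passed. The same translation in isolation (hypothetical helper, mirroring the two guarded subtractions above):

    static int to_backend_local(int main_gpu, int n_registry, int n_rpc) {
        if (main_gpu >= n_registry) {
            main_gpu -= n_registry;
        }
        if (main_gpu >= n_rpc) {
            main_gpu -= n_rpc;
        }
        return main_gpu; // now suitable for e.g. ggml_backend_vk_init()
    }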
@@ -19345,7 +19366,7 @@ struct llama_context * llama_new_context_with_model(
             return nullptr;
         }
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-            ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
                 llama_free(ctx);
@@ -19366,9 +19387,9 @@ struct llama_context * llama_new_context_with_model(
 #elif defined(GGML_USE_SYCL)
         // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
             if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu);
                 llama_free(ctx);
                 return nullptr;
             }
@@ -19387,7 +19408,7 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
-            auto * backend = ggml_backend_kompute_init(model->main_gpu);
+            auto * backend = ggml_backend_kompute_init(main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
                 llama_free(ctx);
@@ -19396,29 +19417,29 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         }
 #elif defined(GGML_USE_CANN)
         // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         // TODO: ggml_backend_cann is not support split tensor now, just leave code here.
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            ggml_backend_t backend = ggml_backend_cann_init(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_cann_init(main_gpu);
             if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);
+                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, main_gpu);
                 llama_free(ctx);
                 return nullptr;
             }
             ctx->backends.push_back(backend);
         } else {
             // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
             // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
             for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
                 ggml_backend_t backend = ggml_backend_cann_init(device);
                 if (backend == nullptr) {
                     LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
                     llama_free(ctx);
                     return nullptr;
                 }
                 ctx->backends.push_back(backend);
             }
         }
 #endif
 
 #ifdef GGML_USE_BLAS
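Note: the last four hunks are one mechanical substitution: each backend-specific init call (Vulkan, SYCL, Kompute, CANN) now receives the remapped local main_gpu instead of the flat model->main_gpu. A worked example of why that matters, with a hypothetical device layout:

    #include <cassert>

    int main() {
        // 1 registry device and 1 RPC server precede the Vulkan group, so the
        // user-facing main_gpu == 2 means "Vulkan device 0".
        int main_gpu = 2;
        const int n_registry = 1;
        const int n_rpc      = 1;
        if (main_gpu >= n_registry) main_gpu -= n_registry;
        if (main_gpu >= n_rpc)      main_gpu -= n_rpc;
        assert(main_gpu == 0); // the local index passed to ggml_backend_vk_init()
        return 0;
    }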