rpc : make RPC servers come first in the device list

Radoslav Gerganov 2024-08-30 17:40:37 +03:00
parent 9379d3cc17
commit 0178724414


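After this change, the RPC endpoints occupy the first rpc_count entries of the device list and the local backends follow, so the backend-specific calls receive local_gpu = gpu - rpc_count instead of the raw device index. A minimal sketch of the new mapping (illustrative only, with made-up endpoints and GPU count, not code from the patch):

// Illustrative sketch of the device ordering after this commit (not part of
// the patch): RPC servers come first, local GPUs follow.
#include <cstdio>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> rpc_servers = {"192.168.88.10:50052", "192.168.88.11:50052"}; // hypothetical endpoints
    int rpc_count  = (int)rpc_servers.size();
    int local_gpus = 2;                        // hypothetical local GPU count
    int dev_count  = rpc_count + local_gpus;   // RPC devices are listed first

    for (int gpu = 0; gpu < dev_count; gpu++) {
        if (gpu < rpc_count) {
            // indices [0, rpc_count) map directly to RPC endpoints
            printf("device %d -> RPC %s\n", gpu, rpc_servers[gpu].c_str());
        } else {
            // local backends see a zero-based index again
            int local_gpu = gpu - rpc_count;
            printf("device %d -> local GPU %d\n", gpu, local_gpu);
        }
    }
    return 0;
}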
@@ -3346,29 +3346,29 @@ static size_t llama_get_device_count(const llama_model & model) {
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     ggml_backend_buffer_type_t buft = nullptr;
-#if defined(GGML_USE_RPC)
-    int dev_count = (int)llama_get_device_count(model);
     int rpc_count = (int)model.rpc_servers.size();
-    if (gpu >= dev_count - rpc_count) {
-        const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+    int local_gpu = gpu - rpc_count;
+#if defined(GGML_USE_RPC)
+    if (gpu < rpc_count) {
+        const char * endpoint = model.rpc_servers[gpu].c_str();
         return ggml_backend_rpc_buffer_type(endpoint);
     }
 #endif
 #if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
-    buft = ggml_backend_cuda_buffer_type(gpu);
+    buft = ggml_backend_cuda_buffer_type(local_gpu);
 #elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(gpu);
+    buft = ggml_backend_vk_buffer_type(local_gpu);
 #elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_buffer_type(gpu);
+    buft = ggml_backend_sycl_buffer_type(local_gpu);
 #elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type(gpu);
+    buft = ggml_backend_kompute_buffer_type(local_gpu);
     if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
     }
 #elif defined(GGML_USE_CANN)
-    buft = ggml_backend_cann_buffer_type(gpu);
+    buft = ggml_backend_cann_buffer_type(local_gpu);
 #endif
     if (buft == nullptr) {
@@ -3376,7 +3376,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     }
     return buft;
     GGML_UNUSED(model);
-    GGML_UNUSED(gpu);
+    GGML_UNUSED(local_gpu);
 }
@@ -3403,13 +3403,13 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
 }
 static size_t llama_get_device_memory(const llama_model & model, int device) {
-#if defined(GGML_USE_RPC)
-    int dev_count = (int)llama_get_device_count(model);
     int rpc_count = (int)model.rpc_servers.size();
-    if (device >= dev_count - rpc_count) {
+    int local_device = device - rpc_count;
+#if defined(GGML_USE_RPC)
+    if (device < rpc_count) {
         size_t total;
         size_t free;
-        const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+        const char * endpoint = model.rpc_servers[device].c_str();
         ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
         return free;
     }
@@ -3417,28 +3417,28 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
 #if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
-    ggml_backend_cuda_get_device_memory(device, &free, &total);
+    ggml_backend_cuda_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(GGML_USE_SYCL)
     size_t total;
     size_t free;
-    ggml_backend_sycl_get_device_memory(device, &free, &total);
+    ggml_backend_sycl_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
+    ggml_backend_vk_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(GGML_USE_CANN)
     size_t total;
     size_t free;
-    ggml_backend_cann_get_device_memory(device, &free, &total);
+    ggml_backend_cann_get_device_memory(local_device, &free, &total);
     return free;
 #else
     return 1;
 #endif
     GGML_UNUSED(model);
-    GGML_UNUSED(device);
+    GGML_UNUSED(local_device);
 }
 //
@@ -18186,6 +18186,20 @@ struct llama_context * llama_new_context_with_model(
     if (!hparams.vocab_only) {
         // initialize backends
+#if defined(GGML_USE_RPC)
+        if (model->n_gpu_layers > 0) {
+            for (const auto & endpoint : model->rpc_servers) {
+                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        }
+#endif
 #if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
@@ -18310,19 +18324,6 @@ struct llama_context * llama_new_context_with_model(
         }
 #endif
-#if defined(GGML_USE_RPC)
-        if (model->n_gpu_layers > 0) {
-            for (const auto & endpoint : model->rpc_servers) {
-                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-#endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);