llama : offload to RPC in addition to other backends (#7640)

* llama : offload to RPC in addition to other backends

* - fix copy_tensor being called on the src buffer instead of the dst buffer

- always initialize views in the view_src buffer

- add RPC backend to Makefile build

- add endpoint to all RPC object names

* add rpc-server to Makefile

* Update llama.cpp

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
This commit is contained in:
Radoslav Gerganov 2024-06-03 20:03:26 +03:00 committed by GitHub
parent a5735e4426
commit bde7cd3cd9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 86 additions and 53 deletions

View file

@ -2371,13 +2371,34 @@ struct llama_context {
struct llama_control_vector cvec;
};
static size_t llama_get_device_count(const llama_model & model) {
size_t count = 1;
#if defined(GGML_USE_CUDA)
count = ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
count = ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
count = ggml_backend_vk_get_device_count();
#endif
#if defined(GGML_USE_RPC)
count += model.rpc_servers.size();
#endif
return count;
GGML_UNUSED(model);
}
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
ggml_backend_buffer_type_t buft = nullptr;
#ifdef GGML_USE_RPC
std::string endpoint = model.rpc_servers[gpu];
buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
#elif defined(GGML_USE_METAL)
#if defined(GGML_USE_RPC)
int dev_count = (int)llama_get_device_count(model);
int rpc_count = (int)model.rpc_servers.size();
if (gpu >= dev_count - rpc_count) {
const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
return ggml_backend_rpc_buffer_type(endpoint);
}
#endif
#if defined(GGML_USE_METAL)
buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUDA)
buft = ggml_backend_cuda_buffer_type(gpu);
@ -2425,29 +2446,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
GGML_UNUSED(tensor_split);
}
static size_t llama_get_device_count(const llama_model & model) {
#if defined(GGML_USE_RPC)
return model.rpc_servers.size();
#elif defined(GGML_USE_CUDA)
return ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
return ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
return ggml_backend_vk_get_device_count();
#else
return 1;
#endif
GGML_UNUSED(model);
}
static size_t llama_get_device_memory(const llama_model & model, int device) {
#if defined(GGML_USE_RPC)
size_t total;
size_t free;
std::string endpoint = model.rpc_servers[device];
ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
return free;
#elif defined(GGML_USE_CUDA)
int dev_count = (int)llama_get_device_count(model);
int rpc_count = (int)model.rpc_servers.size();
if (device >= dev_count - rpc_count) {
size_t total;
size_t free;
const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
return free;
}
#endif
#if defined(GGML_USE_CUDA)
size_t total;
size_t free;
ggml_backend_cuda_get_device_memory(device, &free, &total);
@ -16160,7 +16171,7 @@ struct llama_model * llama_load_model_from_file(
return true;
};
}
if (params.rpc_servers != nullptr) {
if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
// split the servers set them into model->rpc_servers
std::string servers(params.rpc_servers);
size_t pos = 0;
@ -16323,17 +16334,7 @@ struct llama_context * llama_new_context_with_model(
if (!hparams.vocab_only) {
// initialize backends
#if defined(GGML_USE_RPC)
for (auto & server : model->rpc_servers) {
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
#elif defined(GGML_USE_METAL)
#if defined(GGML_USE_METAL)
if (model->n_gpu_layers > 0) {
ctx->backend_metal = ggml_backend_metal_init();
if (ctx->backend_metal == nullptr) {
@ -16425,6 +16426,19 @@ struct llama_context * llama_new_context_with_model(
}
ctx->backends.push_back(backend);
}
#endif
#if defined(GGML_USE_RPC)
if (model->n_gpu_layers > 0) {
for (const auto & endpoint : model->rpc_servers) {
ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
}
#endif
ctx->backend_cpu = ggml_backend_cpu_init();
if (ctx->backend_cpu == nullptr) {