llama : offload to RPC in addition to other backends
This commit is contained in:
parent
7846540bd2
commit
6c276deb9d
2 changed files with 53 additions and 38 deletions
|
@ -321,7 +321,9 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
|
||||||
ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
|
ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
|
||||||
} else if (ggml_backend_buffer_is_host(dst->buffer)) {
|
} else if (ggml_backend_buffer_is_host(dst->buffer)) {
|
||||||
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
|
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
|
||||||
} else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
|
}
|
||||||
|
bool same_backend = strcmp(ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)) == 0;
|
||||||
|
if (!same_backend || !ggml_backend_buffer_copy_tensor(src, dst)) {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
|
fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
|
||||||
#endif
|
#endif
|
||||||
|
|
81
llama.cpp
81
llama.cpp
|
@ -2369,13 +2369,34 @@ struct llama_context {
|
||||||
struct llama_control_vector cvec;
|
struct llama_control_vector cvec;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static size_t llama_get_device_count(const llama_model & model) {
|
||||||
|
size_t count = 1;
|
||||||
|
#if defined(GGML_USE_CUDA)
|
||||||
|
count = ggml_backend_cuda_get_device_count();
|
||||||
|
#elif defined(GGML_USE_SYCL)
|
||||||
|
count = ggml_backend_sycl_get_device_count();
|
||||||
|
#elif defined(GGML_USE_VULKAN)
|
||||||
|
count = ggml_backend_vk_get_device_count();
|
||||||
|
#endif
|
||||||
|
#if defined(GGML_USE_RPC)
|
||||||
|
count += model.rpc_servers.size();
|
||||||
|
#endif
|
||||||
|
return count;
|
||||||
|
GGML_UNUSED(model);
|
||||||
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
||||||
ggml_backend_buffer_type_t buft = nullptr;
|
ggml_backend_buffer_type_t buft = nullptr;
|
||||||
|
|
||||||
#ifdef GGML_USE_RPC
|
#if defined(GGML_USE_RPC)
|
||||||
std::string endpoint = model.rpc_servers[gpu];
|
int dev_count = (int)llama_get_device_count(model);
|
||||||
buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
|
int rpc_count = (int)model.rpc_servers.size();
|
||||||
#elif defined(GGML_USE_METAL)
|
if (gpu >= dev_count - rpc_count) {
|
||||||
|
const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
|
||||||
|
return ggml_backend_rpc_buffer_type(endpoint);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(GGML_USE_METAL)
|
||||||
buft = ggml_backend_metal_buffer_type();
|
buft = ggml_backend_metal_buffer_type();
|
||||||
#elif defined(GGML_USE_CUDA)
|
#elif defined(GGML_USE_CUDA)
|
||||||
buft = ggml_backend_cuda_buffer_type(gpu);
|
buft = ggml_backend_cuda_buffer_type(gpu);
|
||||||
|
@ -2423,29 +2444,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
|
||||||
GGML_UNUSED(tensor_split);
|
GGML_UNUSED(tensor_split);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t llama_get_device_count(const llama_model & model) {
|
|
||||||
#if defined(GGML_USE_RPC)
|
|
||||||
return model.rpc_servers.size();
|
|
||||||
#elif defined(GGML_USE_CUDA)
|
|
||||||
return ggml_backend_cuda_get_device_count();
|
|
||||||
#elif defined(GGML_USE_SYCL)
|
|
||||||
return ggml_backend_sycl_get_device_count();
|
|
||||||
#elif defined(GGML_USE_VULKAN)
|
|
||||||
return ggml_backend_vk_get_device_count();
|
|
||||||
#else
|
|
||||||
return 1;
|
|
||||||
#endif
|
|
||||||
GGML_UNUSED(model);
|
|
||||||
}
|
|
||||||
|
|
||||||
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
||||||
#if defined(GGML_USE_RPC)
|
#if defined(GGML_USE_RPC)
|
||||||
|
int dev_count = (int)llama_get_device_count(model);
|
||||||
|
int rpc_count = (int)model.rpc_servers.size();
|
||||||
|
if (device >= dev_count - rpc_count) {
|
||||||
size_t total;
|
size_t total;
|
||||||
size_t free;
|
size_t free;
|
||||||
std::string endpoint = model.rpc_servers[device];
|
const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
|
||||||
ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
|
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
|
||||||
return free;
|
return free;
|
||||||
#elif defined(GGML_USE_CUDA)
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(GGML_USE_CUDA)
|
||||||
size_t total;
|
size_t total;
|
||||||
size_t free;
|
size_t free;
|
||||||
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
||||||
|
@ -16146,7 +16157,7 @@ struct llama_model * llama_load_model_from_file(
|
||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
if (params.rpc_servers != nullptr) {
|
if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
|
||||||
// split the servers set them into model->rpc_servers
|
// split the servers set them into model->rpc_servers
|
||||||
std::string servers(params.rpc_servers);
|
std::string servers(params.rpc_servers);
|
||||||
size_t pos = 0;
|
size_t pos = 0;
|
||||||
|
@ -16304,17 +16315,7 @@ struct llama_context * llama_new_context_with_model(
|
||||||
|
|
||||||
if (!hparams.vocab_only) {
|
if (!hparams.vocab_only) {
|
||||||
// initialize backends
|
// initialize backends
|
||||||
#if defined(GGML_USE_RPC)
|
#if defined(GGML_USE_METAL)
|
||||||
for (auto & server : model->rpc_servers) {
|
|
||||||
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
|
|
||||||
if (backend == nullptr) {
|
|
||||||
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
|
|
||||||
llama_free(ctx);
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
ctx->backends.push_back(backend);
|
|
||||||
}
|
|
||||||
#elif defined(GGML_USE_METAL)
|
|
||||||
if (model->n_gpu_layers > 0) {
|
if (model->n_gpu_layers > 0) {
|
||||||
ctx->backend_metal = ggml_backend_metal_init();
|
ctx->backend_metal = ggml_backend_metal_init();
|
||||||
if (ctx->backend_metal == nullptr) {
|
if (ctx->backend_metal == nullptr) {
|
||||||
|
@ -16406,6 +16407,18 @@ struct llama_context * llama_new_context_with_model(
|
||||||
}
|
}
|
||||||
ctx->backends.push_back(backend);
|
ctx->backends.push_back(backend);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(GGML_USE_RPC)
|
||||||
|
for (int i = 0; i < (int)model->rpc_servers.size(); i++) {
|
||||||
|
const char * endpoint = model->rpc_servers[i].c_str();
|
||||||
|
ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
|
||||||
|
if (backend == nullptr) {
|
||||||
|
LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint);
|
||||||
|
llama_free(ctx);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
ctx->backends.push_back(backend);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
ctx->backend_cpu = ggml_backend_cpu_init();
|
ctx->backend_cpu = ggml_backend_cpu_init();
|
||||||
if (ctx->backend_cpu == nullptr) {
|
if (ctx->backend_cpu == nullptr) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue