add multi-gpu support for ngl calculations
This commit is contained in:
parent 1589e524c7
commit 48fbf8ca1a

1 changed file with 13 additions and 3 deletions

 llama.cpp | 16 +++++++++++++---
@@ -4362,13 +4362,23 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }

-static int llm_determine_max_ngl(const llama_model_loader & ml, const llama_model & model, const int main_gpu) {
+static int llm_determine_max_ngl(const llama_model_loader & ml, const llama_model & model, const int main_gpu, enum llama_split_mode split_mode) {
     const auto & hparams = model.hparams;

     // could become negative - use signed size_t
-    ssize_t available_gpu_memory = llama_get_available_device_memory(main_gpu);
+    ssize_t available_gpu_memory = 0;
     int n_layer = hparams.n_layer;

+    if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
+        // sum the free memory across all devices; there may be a cleaner way to do this
+        int device_count = llama_get_device_count();
+        for (int i = 0; i < device_count; ++i) {
+            available_gpu_memory += llama_get_available_device_memory(i);
+        }
+    } else {
+        available_gpu_memory = llama_get_available_device_memory(main_gpu);
+    }
+
// "avoid a scenario where an application ooms because llama.cpp only left 5 MB of VRAM" - https://github.com/ggerganov/llama.cpp/pull/6502#discussion_r1555060962
|
// "avoid a scenario where an application ooms because llama.cpp only left 5 MB of VRAM" - https://github.com/ggerganov/llama.cpp/pull/6502#discussion_r1555060962
|
||||||
available_gpu_memory -= 50 * MiB;
|
available_gpu_memory -= 50 * MiB;
|
||||||
|
|
||||||
|
@ -4452,7 +4462,7 @@ static bool llm_load_tensors(
|
||||||
auto & hparams = model.hparams;
|
auto & hparams = model.hparams;
|
||||||
|
|
||||||
if (n_gpu_layers == -2) {
|
if (n_gpu_layers == -2) {
|
||||||
n_gpu_layers = llm_determine_max_ngl(ml, model, main_gpu);
|
n_gpu_layers = llm_determine_max_ngl(ml, model, main_gpu, split_mode);
|
||||||
LLAMA_LOG_INFO("%s: automatically set n_gpu_layers to %d\n", __func__, n_gpu_layers);
|
LLAMA_LOG_INFO("%s: automatically set n_gpu_layers to %d\n", __func__, n_gpu_layers);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
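For readers who want to see the control flow in isolation, here is a minimal standalone sketch of the budget logic this commit introduces. SplitMode, free_vram_per_device, bytes_per_layer, and the final bytes-to-layers division are illustrative assumptions, not llama.cpp API; only the aggregate-vs-main-GPU branching and the 50 MiB reserve mirror the patch above.

// Standalone sketch of the multi-GPU VRAM budget logic shown in the diff.
// SplitMode, free_vram_per_device, and bytes_per_layer are illustrative
// stand-ins, not llama.cpp API.
#include <cstdint>
#include <cstdio>
#include <vector>

enum class SplitMode { None, Layer, Row };

constexpr int64_t MiB = 1024 * 1024;

// Estimate how many layers fit in VRAM given per-device free memory.
static int determine_max_ngl(const std::vector<int64_t> & free_vram_per_device,
                             SplitMode split_mode, int main_gpu,
                             int n_layer, int64_t bytes_per_layer) {
    // Signed: the budget can go negative after the safety margin is applied.
    int64_t budget = 0;

    if (split_mode == SplitMode::Layer) {
        // Layers are spread across all devices, so the whole pool counts.
        for (int64_t free_bytes : free_vram_per_device) {
            budget += free_bytes;
        }
    } else {
        // Otherwise only the main GPU's free memory is relevant.
        budget = free_vram_per_device.at(main_gpu);
    }

    // Keep a small reserve so the application is not left with ~0 VRAM.
    budget -= 50 * MiB;
    if (budget <= 0) {
        return 0;
    }

    // Assumed conversion from bytes to layers; the real function derives
    // the per-layer cost from the model's tensors instead of a flat estimate.
    int64_t layers = budget / bytes_per_layer;
    if (layers > n_layer + 1) {
        layers = n_layer + 1; // assumption: +1 accounts for the output layer
    }
    return static_cast<int>(layers);
}

int main() {
    // Two hypothetical GPUs with 8 GiB and 6 GiB free.
    std::vector<int64_t> free_vram = { 8192 * MiB, 6144 * MiB };
    int ngl = determine_max_ngl(free_vram, SplitMode::Layer, /*main_gpu=*/0,
                                /*n_layer=*/32, /*bytes_per_layer=*/300 * MiB);
    printf("automatically set n_gpu_layers to %d\n", ngl);
    return 0;
}

In the patch itself only the way available_gpu_memory is gathered changes with the split mode; the 50 MiB reserve and the downstream conversion to a layer count appear as unchanged context.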