implement automatic max ngl detection

Yui 2024-04-05 11:59:38 +02:00
parent a307375c02
commit 1e66c3a7b2
5 changed files with 68 additions and 3 deletions

View file

@@ -836,7 +836,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             invalid_param = true;
             return true;
         }
-        params.n_gpu_layers = std::stoi(argv[i]);
+        std::string argValue = argv[i];
+        if (argValue == "auto" || argValue == "a") {
+            params.n_gpu_layers = -2;
+        } else {
+            params.n_gpu_layers = std::stoi(argValue);
+        }
         if (!llama_supports_gpu_offload()) {
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
@@ -1407,6 +1412,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_supports_gpu_offload()) {
         printf(" -ngl N, --n-gpu-layers N\n");
         printf(" number of layers to store in VRAM\n");
+        printf(" set to 'auto' or 'a' to determine max automatically based on VRAM size\n");
         printf(" -ngld N, --n-gpu-layers-draft N\n");
         printf(" number of layers to store in VRAM for the draft model\n");
         printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
@@ -2480,7 +2486,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
     fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
-    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
+    fprintf(stream, "n_gpu_layers: %d # default: -1, auto: -2\n", params.n_gpu_layers);
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
     fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");

View file

@@ -62,7 +62,7 @@ struct gpt_params {
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
     float p_split = 0.1f; // speculative decoding split probability
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default, -2 - automatically determine)
     int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors

View file

@@ -2612,6 +2612,11 @@ GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, si
     CUDA_CHECK(cudaMemGetInfo(free, total));
 }
 
+GGML_CALL void ggml_backend_cuda_get_free_device_memory(int device, size_t * free) {
+    size_t total;
+    ggml_backend_cuda_get_device_memory(device, free, &total);
+}
+
 GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
     if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
         return false;

View file

@@ -34,6 +34,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type
 GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
 GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
 GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
+GGML_API GGML_CALL void ggml_backend_cuda_get_free_device_memory(int device, size_t * free);
 GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
 GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);

View file

@@ -1648,6 +1648,28 @@ static size_t llama_get_device_memory(int device) {
 #endif
 }
 
+// TODO: implement for other backends to return free memory
+static size_t llama_get_available_device_memory(int device) {
+#if defined(GGML_USE_CUDA)
+    size_t free;
+    ggml_backend_cuda_get_free_device_memory(device, &free);
+    return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &total, &free);
+    return free;
+#elif defined(GGML_USE_VULKAN)
+    size_t total;
+    size_t free;
+    ggml_backend_vk_get_device_memory(device, &total, &free);
+    return free;
+#else
+    return 1;
+    GGML_UNUSED(device);
+#endif
+}
+
 //
 // globals
 //
@@ -4327,6 +4349,32 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }
 
+static int llm_determine_max_ngl(const llama_model_loader & ml, const llama_model & model, const int main_gpu) {
+    const auto & hparams = model.hparams;
+
+    size_t available_gpu_memory = llama_get_available_device_memory(main_gpu);
+
+    // TODO: This is a rough, pretty inaccurate estimate, should implement using existing layer size and not guesstimating
+    size_t model_size = ml.n_bytes;
+    int32_t model_layers = hparams.n_layer;
+    size_t memory_per_layer = model_size / model_layers;
+
+    // TODO: get buffer size dynamically
+    int32_t buf_size = 400 * MiB;
+    int32_t buf_size_k = 200 * MiB;
+    int32_t buf_size_v = 200 * MiB;
+    int32_t total_buf_size = buf_size + buf_size_k + buf_size_v;
+
+    available_gpu_memory = available_gpu_memory - hparams.n_ctx_train; // context size
+    available_gpu_memory = available_gpu_memory - total_buf_size;      // buffer size
+
+    // Calculate the maximum number of layers that can fit into the GPU memory
+    int32_t max_ngl = std::floor(static_cast<float>(available_gpu_memory) / memory_per_layer);
+
+    return max_ngl;
+}
+
 // Returns false if cancelled by progress_callback
 static bool llm_load_tensors(
         llama_model_loader & ml,
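Note: a worked example of the estimate above with made-up numbers (not measurements): a ~4200 MiB model with 32 layers and ~6144 MiB of free VRAM, minus the fixed 800 MiB buffer guess and the n_ctx_train term, yields roughly 40 layers, so every layer would be offloaded. A minimal sketch, assuming MiB = 1024*1024 as in llama.cpp:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const size_t MiB = 1024 * 1024;
        size_t available_gpu_memory = 6144 * MiB;               // hypothetical free VRAM
        size_t model_size           = 4200 * MiB;               // hypothetical ml.n_bytes
        int32_t model_layers        = 32;                       // hypothetical hparams.n_layer
        size_t memory_per_layer     = model_size / model_layers; // ~131 MiB per layer

        available_gpu_memory -= 4096;                           // n_ctx_train, subtracted as-is by the patch
        available_gpu_memory -= 800 * MiB;                      // buf_size + buf_size_k + buf_size_v

        int32_t max_ngl = std::floor(static_cast<float>(available_gpu_memory) / memory_per_layer);
        printf("max_ngl = %d\n", max_ngl);                      // prints 40 with these numbers
        return 0;
    }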
@@ -4342,6 +4390,11 @@ static bool llm_load_tensors(
 
     auto & hparams = model.hparams;
 
+    if (n_gpu_layers == -2) {
+        n_gpu_layers = llm_determine_max_ngl(ml, model, main_gpu);
+        LLAMA_LOG_INFO("%s: automatically set n_gpu_layers = %d\n", __func__, n_gpu_layers);
+    }
+
     model.split_mode   = split_mode;
     model.main_gpu     = main_gpu;
     model.n_gpu_layers = n_gpu_layers;