calculate max ngl by looking at block sizes
parent a6120326b2
commit 1589e524c7

1 changed file with 73 additions and 14 deletions:

llama.cpp (87 changed lines)
@@ -3276,6 +3276,17 @@ struct llama_model_loader {
         return cur;
     }
 
+    size_t estimate_tensor_bytes(const std::string & name, const std::vector<int64_t> & ne, bool required = true) const {
+        const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+
+        if (cur == NULL) {
+            return 0;
+        }
+
+        int64_t nbytes = ggml_nbytes(cur);
+        return nbytes;
+    }
+
     struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
         const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
 
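A minimal standalone sketch (not part of the commit) of what the new helper reports: estimate_tensor_bytes() looks up the tensor metadata and forwards ggml_nbytes(), which accounts for the tensor's shape and (possibly quantized) type without allocating any data. The shape and type below are made up for illustration.

// Illustration only: ggml_nbytes() for a hypothetical 4096 x 4096 Q4_0 weight,
// created with no_alloc so only metadata exists (no tensor data is allocated).
#include <cstdio>
#include "ggml.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 4096);
    printf("estimated bytes: %zu\n", ggml_nbytes(t));

    ggml_free(ctx);
    return 0;
}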
@@ -4354,25 +4365,73 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 static int llm_determine_max_ngl(const llama_model_loader & ml, const llama_model & model, const int main_gpu) {
     const auto & hparams = model.hparams;
 
-    size_t available_gpu_memory = llama_get_available_device_memory(main_gpu);
+    // could become negative - use signed size_t
+    ssize_t available_gpu_memory = llama_get_available_device_memory(main_gpu);
+    int n_layer = hparams.n_layer;
 
-    // TODO: This is a rough, pretty inaccurate estimate, should implement using existing layer size and not guesstimating
-    size_t model_size = ml.n_bytes;
-    int32_t model_layers = hparams.n_layer;
-    size_t memory_per_layer = model_size / model_layers;
+    // "avoid a scenario where an application ooms because llama.cpp only left 5 MB of VRAM" - https://github.com/ggerganov/llama.cpp/pull/6502#discussion_r1555060962
+    available_gpu_memory -= 50 * MiB;
+
+    // determine tensor sizes
+    size_t total_size = 0;
+    size_t extra_size = 0;
+    std::vector<size_t> layer_sizes(n_layer);
+    for (int i = 0; i < ml.n_tensors; ++i) {
+        const struct ggml_tensor * tensor = (const struct ggml_tensor *) ml.get_tensor_meta(i);
+        std::vector<int64_t> tensor_ne = { tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3] };
+        size_t tensor_size = ml.estimate_tensor_bytes(tensor->name, tensor_ne);
+        //LLAMA_LOG_INFO("%s: %-30s: %12ld\n", __func__, tensor->name, tensor_size);
+        // all layer specific tensor names have the prefix "blk."
+        if (strncmp(tensor->name, "blk.", 4) == 0) {
+            int layer_no;
+            sscanf(tensor->name+4, "%d", &layer_no);
+            layer_sizes[layer_no] += tensor_size;
+        } else {
+            extra_size += tensor_size;
+        }
+        total_size += tensor_size;
+    }
 
     // TODO: get buffer size dynamically
-    int32_t buf_size = 400 * MiB;
-    int32_t buf_size_k = 200 * MiB;
-    int32_t buf_size_v = 200 * MiB;
+    size_t buf_size = 400 * MiB;
+    size_t buf_size_k = 200 * MiB;
+    size_t buf_size_v = 200 * MiB;
+    ssize_t buffer_size = buf_size + buf_size_k + buf_size_v;
 
-    int32_t total_buf_size = buf_size + buf_size_k + buf_size_v;
-
-    available_gpu_memory = available_gpu_memory - hparams.n_ctx_train; // context size
-    available_gpu_memory = available_gpu_memory - total_buf_size; // buffer size
+    // can/will be pretty big for large models
+    size_t n_ctx = hparams.n_ctx_train;
+    size_t ctx_size =
+        ggml_tensor_overhead()*(ml.n_tensors + 1) +
+        ggml_tensor_overhead()*hparams.n_expert*n_layer;
+    size_t context_size = n_ctx*ctx_size;
 
-    // Calculate the maximum number of layers that can fit into the GPU memory
-    int32_t max_ngl = std::floor(static_cast<float>(available_gpu_memory) / memory_per_layer);
+    // Calculate the maximum number of layers that can fit into the available GPU memory
+    int max_ngl = 0;
+    ssize_t used_memory = extra_size+buffer_size+context_size;
+    for (int i = 0; i < n_layer; ++i) {
+        LLAMA_LOG_INFO("%s: layer %2d size: %12ld\n", __func__, i, layer_sizes[i]);
+        used_memory += layer_sizes[i];
+        if (used_memory > available_gpu_memory) {
+            break;
+        }
+        max_ngl++;
+    }
+
+    LLAMA_LOG_INFO("%s: extra size: %12ld\n", __func__, extra_size);
+    LLAMA_LOG_INFO("%s: ----------------------------------\n", __func__);
+    LLAMA_LOG_INFO("%s: total size: %12ld\n", __func__, total_size);
+    LLAMA_LOG_INFO("%s: available_gpu_memory: %12ld\n", __func__, available_gpu_memory);
+    LLAMA_LOG_INFO("%s: buffer size: %12ld\n", __func__, buffer_size);
+    LLAMA_LOG_INFO("%s: total context size: %12ld\n", __func__, context_size);
+    LLAMA_LOG_INFO("%s: used memory: %12ld\n", __func__, used_memory);
+    LLAMA_LOG_INFO("%s: ----------------------------------\n", __func__);
+    LLAMA_LOG_INFO("%s: tokens in context: %ld\n", __func__, n_ctx);
+    LLAMA_LOG_INFO("%s: per token context size: %ld\n", __func__, ctx_size);
+    LLAMA_LOG_INFO("%s: layer_count: %d\n", __func__, n_layer);
+    LLAMA_LOG_INFO("%s: max_ngl: %d\n", __func__, max_ngl);
 
     return max_ngl;
 }
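The core of the new logic is the greedy loop above: fixed costs (non-layer tensors, the hard-coded buffer guesses, the estimated context overhead) are charged first, then whole layers are added until the next one would exceed the available memory. A self-contained sketch of that rule with made-up numbers (hypothetical helper, not part of the commit):

// Hypothetical helper mirroring the accumulation rule in llm_determine_max_ngl():
// charge the fixed costs, then count how many whole layers still fit.
#include <cstdio>
#include <cstdint>
#include <vector>

static int max_layers_that_fit(const std::vector<size_t> & layer_sizes,
                               int64_t available, int64_t fixed_cost) {
    int64_t used = fixed_cost;
    int     n    = 0;
    for (size_t s : layer_sizes) {
        used += (int64_t) s;
        if (used > available) {
            break; // the next layer would not fit
        }
        ++n;
    }
    return n;
}

int main() {
    // made-up numbers: 16 layers of 500 MiB each, 1 GiB of fixed costs, 6 GiB free
    const std::vector<size_t> layers(16, 500ull * 1024 * 1024);
    const int64_t available = 6ll * 1024 * 1024 * 1024;
    const int64_t fixed     = 1ll * 1024 * 1024 * 1024;
    printf("max ngl = %d\n", max_layers_that_fit(layers, available, fixed)); // prints 10
    return 0;
}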
@@ -4394,7 +4453,7 @@ static bool llm_load_tensors(
 
     if (n_gpu_layers == -2) {
         n_gpu_layers = llm_determine_max_ngl(ml, model, main_gpu);
-        LLAMA_LOG_INFO("%s: automatically set n_gpu_layers = %d\n", __func__, n_gpu_layers);
+        LLAMA_LOG_INFO("%s: automatically set n_gpu_layers to %d\n", __func__, n_gpu_layers);
     }
 
     model.split_mode = split_mode;
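Usage sketch (hedged, assumes this branch): with the change above, passing the -2 sentinel for n_gpu_layers through the public C API should make llm_load_tensors() pick the layer count automatically; whether the command-line -ngl flag forwards -2 is outside this diff.

// Illustration only: request automatic offloading via the C API by passing the
// -2 sentinel that llm_load_tensors() now interprets (assumption: the sentinel
// is accepted end-to-end on this branch).
#include <cstdio>
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = -2; // ask llm_determine_max_ngl() to pick the count
    mparams.main_gpu     = 0;  // device whose free memory is queried

    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}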