diff --git a/llama.cpp b/llama.cpp
index da72bf0a5..0ca439dcd 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3276,6 +3276,17 @@ struct llama_model_loader {
         return cur;
     }
 
+    size_t estimate_tensor_bytes(const std::string & name, const std::vector<int64_t> & ne, bool required = true) const {
+        const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+
+        if (cur == NULL) {
+            return 0;
+        }
+
+        size_t nbytes = ggml_nbytes(cur);
+        return nbytes;
+    }
+
     struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
         const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
 
@@ -4354,25 +4365,73 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 static int llm_determine_max_ngl(const llama_model_loader & ml, const llama_model & model, const int main_gpu) {
     const auto & hparams = model.hparams;
 
-    size_t available_gpu_memory = llama_get_available_device_memory(main_gpu);
+    // could become negative - use the signed ssize_t
+    ssize_t available_gpu_memory = llama_get_available_device_memory(main_gpu);
+    int n_layer = hparams.n_layer;
 
-    // TODO: This is a rough, pretty inaccurate estimate, should implement using existing layer size and not guesstimating
-    size_t model_size = ml.n_bytes;
-    int32_t model_layers = hparams.n_layer;
-    size_t memory_per_layer = model_size / model_layers;
+    // "avoid a scenario where an application ooms because llama.cpp only left 5 MB of VRAM" - https://github.com/ggerganov/llama.cpp/pull/6502#discussion_r1555060962
+    available_gpu_memory -= 50 * MiB;
+
+    // determine tensor sizes
+    size_t total_size = 0;
+    size_t extra_size = 0;
+    std::vector<size_t> layer_sizes(n_layer);
+    for (int i = 0; i < ml.n_tensors; ++i) {
+        const struct ggml_tensor * tensor = (const struct ggml_tensor *) ml.get_tensor_meta(i);
+        std::vector<int64_t> tensor_ne = { tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3] };
+        size_t tensor_size = ml.estimate_tensor_bytes(tensor->name, tensor_ne);
+        //LLAMA_LOG_INFO("%s: %-30s: %12ld\n", __func__, tensor->name, tensor_size);
+        // all layer-specific tensor names have the prefix "blk."
+        if (strncmp(tensor->name, "blk.", 4) == 0) {
+            int layer_no = 0;
+            sscanf(tensor->name+4, "%d", &layer_no);
+            layer_sizes[layer_no] += tensor_size;
+        } else {
+            extra_size += tensor_size;
+        }
+        total_size += tensor_size;
+    }
 
     // TODO: get buffer size dynamically
-    int32_t buf_size = 400 * MiB;
-    int32_t buf_size_k = 200 * MiB;
-    int32_t buf_size_v = 200 * MiB;
+    size_t buf_size = 400 * MiB;
+    size_t buf_size_k = 200 * MiB;
+    size_t buf_size_v = 200 * MiB;
 
-    int32_t total_buf_size = buf_size + buf_size_k + buf_size_v;
+    ssize_t buffer_size = buf_size + buf_size_k + buf_size_v;
 
-    available_gpu_memory = available_gpu_memory - hparams.n_ctx_train; // context size
-    available_gpu_memory = available_gpu_memory - total_buf_size; // buffer size
+    // can/will be pretty big for large models
+    size_t n_ctx = hparams.n_ctx_train;
 
-    // Calculate the maximum number of layers that can fit into the GPU memory
-    int32_t max_ngl = std::floor(static_cast<double>(available_gpu_memory) / memory_per_layer);
+    size_t ctx_size =
+        ggml_tensor_overhead()*(ml.n_tensors + 1) +
+        ggml_tensor_overhead()*hparams.n_expert*n_layer;
+
+    size_t context_size = n_ctx*ctx_size;
+
+    // Calculate the maximum number of layers that can fit into the available GPU memory
+    int max_ngl = 0;
+    ssize_t used_memory = extra_size + buffer_size + context_size;
+    for (int i = 0; i < n_layer; ++i) {
+        LLAMA_LOG_INFO("%s: layer %2d size: %12zu\n", __func__, i, layer_sizes[i]);
+        used_memory += layer_sizes[i];
+        if (used_memory > available_gpu_memory) {
+            break;
+        }
+        max_ngl++;
+    }
+
+    LLAMA_LOG_INFO("%s: extra size: %12zu\n", __func__, extra_size);
+    LLAMA_LOG_INFO("%s: ----------------------------------\n", __func__);
+    LLAMA_LOG_INFO("%s: total size: %12zu\n", __func__, total_size);
+    LLAMA_LOG_INFO("%s: available_gpu_memory: %12zd\n", __func__, available_gpu_memory);
+    LLAMA_LOG_INFO("%s: buffer size: %12zd\n", __func__, buffer_size);
+    LLAMA_LOG_INFO("%s: total context size: %12zu\n", __func__, context_size);
+    LLAMA_LOG_INFO("%s: used memory: %12zd\n", __func__, used_memory);
+    LLAMA_LOG_INFO("%s: ----------------------------------\n", __func__);
+    LLAMA_LOG_INFO("%s: tokens in context: %zu\n", __func__, n_ctx);
+    LLAMA_LOG_INFO("%s: per token context size: %zu\n", __func__, ctx_size);
+    LLAMA_LOG_INFO("%s: layer_count: %d\n", __func__, n_layer);
+    LLAMA_LOG_INFO("%s: max_ngl: %d\n", __func__, max_ngl);
 
     return max_ngl;
 }
@@ -4394,7 +4453,7 @@ static bool llm_load_tensors(
 
     if (n_gpu_layers == -2) {
         n_gpu_layers = llm_determine_max_ngl(ml, model, main_gpu);
-        LLAMA_LOG_INFO("%s: automatically set n_gpu_layers = %d\n", __func__, n_gpu_layers);
+        LLAMA_LOG_INFO("%s: automatically set n_gpu_layers to %d\n", __func__, n_gpu_layers);
    }
 
     model.split_mode = split_mode;