From a9e88c6e57311b36a7f0e62c65b0ee2420fced1b Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Fri, 22 Mar 2024 06:59:04 +0100
Subject: [PATCH] llama_model_loader: immediately add the backend buffer to
 the model buffers in order to free them if an error occurs in the next
 allocation. Reserve the expected size.

---
 llama.cpp | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index decb895f3..891892f25 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5137,12 +5137,17 @@ static bool llm_load_tensors(
     ml.init_mappings(true, &model.mlock_mmaps);
 
     // create the backend buffers
-    std::vector<std::pair<ggml_context *, std::map<uint32_t, ggml_backend_buffer_t>>> ctx_bufs;
+    std::vector<std::pair<ggml_context *, std::unordered_map<uint32_t, ggml_backend_buffer_t>>> ctx_bufs;
+
+    // Ensure we have enough capacity for the maximum backend buffer we will potentially create
+    size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
+    model.bufs.reserve(n_max_backend_buffer);
 
     for (auto & it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
-        std::map<uint32_t, ggml_backend_buffer_t> bufs;
+        std::unordered_map<uint32_t, ggml_backend_buffer_t> bufs;
+        bufs.reserve(n_max_backend_buffer);
 
         // only the mmap region containing the tensors in the model is mapped to the backend buffer
         // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
@@ -5159,6 +5164,7 @@ static bool llm_load_tensors(
                 if (buf == nullptr) {
                     throw std::runtime_error("unable to allocate backend CPU buffer");
                 }
+                model.bufs.push_back(buf);
                 bufs.emplace(idx, buf);
 #ifdef GGML_USE_CUBLAS
                 if (n_layer >= n_gpu_layers) {
@@ -5183,6 +5189,7 @@ static bool llm_load_tensors(
                 if (buf == nullptr) {
                     throw std::runtime_error("unable to allocate backend metal buffer");
                 }
+                model.bufs.push_back(buf);
                 bufs.emplace(idx, buf);
             }
         }
@@ -5192,6 +5199,7 @@ static bool llm_load_tensors(
             if (buf == nullptr) {
                 throw std::runtime_error("unable to allocate backend buffer");
             }
+            model.bufs.push_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                 model.mlock_bufs.emplace_back(new llama_mlock);
                 auto & mlock_buf = model.mlock_bufs.back();
@@ -5209,7 +5217,6 @@ static bool llm_load_tensors(
             // indicate that this buffer contains weights
             // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
             ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-            model.bufs.push_back(buf.second);
         }
 
         ctx_bufs.emplace_back(ctx, bufs);
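
The patch applies a standard exception-safety pattern: register each buffer in the owning container immediately after it is allocated, instead of in a single loop at the end, so that if a later allocation throws, everything allocated so far is already owned and can be freed during unwinding. The `reserve()` calls guarantee the registering `push_back` cannot itself reallocate (and potentially throw) while buffers are in flight. Below is a minimal standalone sketch of that pattern; `model_buffers`, `alloc_buffer`, and `load_buffers` are hypothetical stand-ins for illustration, not the ggml-backend API used by the real patch.

```cpp
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <stdexcept>
#include <vector>

// Hypothetical stand-in for ggml_backend_buffer_t; the real code allocates
// backend buffers through the ggml-backend API instead of malloc.
using backend_buffer_t = void *;

static backend_buffer_t alloc_buffer(std::size_t size, bool fail) {
    return fail ? nullptr : std::malloc(size);
}

struct model_buffers {
    std::vector<backend_buffer_t> bufs;

    // Frees everything that was registered, whether or not loading finished.
    ~model_buffers() {
        for (backend_buffer_t b : bufs) {
            std::free(b);
        }
    }
};

// Mirrors the pattern from the patch: reserve capacity up front, then hand
// each buffer to the owning container *immediately* after allocation. If
// allocation i fails, buffers 0..i-1 are already owned by `model` and are
// freed when the exception unwinds the stack.
static void load_buffers(model_buffers & model, std::size_t n_buffers, std::size_t fail_at) {
    // With capacity reserved, push_back cannot reallocate, so it cannot
    // throw while a freshly allocated buffer is still unregistered.
    model.bufs.reserve(n_buffers);
    for (std::size_t i = 0; i < n_buffers; i++) {
        backend_buffer_t buf = alloc_buffer(1024, /*fail=*/i == fail_at);
        if (buf == nullptr) {
            throw std::runtime_error("unable to allocate backend buffer");
        }
        model.bufs.push_back(buf); // take ownership right away
    }
}

int main() {
    try {
        model_buffers model;
        load_buffers(model, /*n_buffers=*/4, /*fail_at=*/2);
    } catch (const std::exception & e) {
        // The two buffers allocated before the failure are freed by
        // ~model_buffers during stack unwinding; nothing leaks.
        std::fprintf(stderr, "load failed: %s\n", e.what());
    }
    return 0;
}
```

This is also why the final per-buffer loop in the patch no longer pushes into `model.bufs`: by the time it runs, every buffer was already registered at its allocation site, and the loop only has to mark the buffers as weight storage.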