From 078a1aca0648204c2abaec097b04c1bac8cf3795 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Thu, 21 Mar 2024 21:33:14 +0100
Subject: [PATCH] llama_model_loader: map file to backend buffer if the
 allocation succeeds only

---
 llama.cpp | 43 +++++++++++++++++++------------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 53b5a0608..a7945ef09 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3192,7 +3192,7 @@ struct llama_model_loader {
 
     void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
         GGML_ASSERT(!mappings.empty());
-        const auto & mapping = mappings[idx];
+        const auto & mapping = mappings.at(idx);
 
         *first = mapping->size;
         *last  = 0;
@@ -3211,7 +3211,7 @@ struct llama_model_loader {
     void load_data_for(struct ggml_tensor * cur) const {
         const auto & w = get_weights(ggml_get_name(cur));
 
-        if (use_mmap && w.idx < mappings.size()) {
+        if (use_mmap) {
             const auto & mapping = mappings.at(w.idx);
             if (cur->data == nullptr) {
                 cur->data = (uint8_t *)mapping->addr + w.offs;
@@ -3232,7 +3232,7 @@ struct llama_model_loader {
     std::vector<std::pair<size_t, size_t>> mmaps_used;
 
     // Returns false if cancelled by progress_callback
-    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::vector<ggml_backend_buffer_t> bufs_mmap, std::vector<std::unique_ptr<llama_mlock>> * lmlocks) {
+    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::map<uint32_t, ggml_backend_buffer_t> bufs_mmap, std::vector<std::unique_ptr<llama_mlock>> * lmlocks) {
         GGML_ASSERT(size_data != 0 && "call init_mappings() first");
 
         std::vector<no_init<uint8_t>> read_buf;
@@ -3246,9 +3246,12 @@ struct llama_model_loader {
             const auto & w = get_weights(ggml_get_name(cur));
             size_t n_size = ggml_nbytes(cur);
 
-            if (use_mmap && w.idx < mappings.size()) {
+            if (use_mmap) {
                 const auto & mapping = mappings.at(w.idx);
-                ggml_backend_buffer_t buf_mmap = bufs_mmap.size() > w.idx ? bufs_mmap.at(w.idx) : nullptr;
+                ggml_backend_buffer_t buf_mmap = nullptr;
+                if (bufs_mmap.count(w.idx)) {
+                    buf_mmap = bufs_mmap.at(w.idx);
+                }
                 GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                 if (buf_mmap && cur->data == nullptr) {
                     ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + w.offs);
@@ -3283,7 +3286,7 @@ struct llama_model_loader {
         // check if this is the last call and do final cleanup
         if (size_done >= size_data) {
             // unmap offloaded tensors and metadata
-            if (use_mmap && !mappings.empty()) {
+            if (use_mmap) {
                 for (uint32_t file_no = 0; file_no < mappings.size(); file_no++) {
                     const auto & mmap_used = mmaps_used[file_no];
                     auto & mapping = mappings.at(file_no);
@@ -5129,12 +5132,12 @@ static bool llm_load_tensors(
     ml.init_mappings(true, &model.mlock_mmaps);
 
     // create the backend buffers
-    std::vector<std::pair<ggml_context *, std::vector<ggml_backend_buffer_t>>> ctx_bufs;
+    std::vector<std::pair<ggml_context *, std::map<uint32_t, ggml_backend_buffer_t>>> ctx_bufs;
 
     for (auto & it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
-        std::vector<ggml_backend_buffer_t> bufs;
+        std::map<uint32_t, ggml_backend_buffer_t> bufs;
 
         // only the mmap region containing the tensors in the model is mapped to the backend buffer
         // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
@@ -5145,12 +5148,11 @@ static bool llm_load_tensors(
                 size_t first, last;
                 ml.get_mapping_range(&first, &last, &addr, file_no, ctx);
                 if (first >= last) {
-                    bufs.push_back(nullptr); // add a dummy buffer to keep the indices in sync
                     continue;
                 }
                 ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *)addr + first, last - first);
                 if (buf != nullptr) {
-                    bufs.push_back(buf);
+                    bufs.emplace(file_no, buf);
 #ifdef GGML_USE_CUBLAS
                     if (n_layer >= n_gpu_layers) {
                         ggml_backend_cuda_register_host_buffer(
@@ -5158,8 +5160,6 @@ static bool llm_load_tensors(
                             ggml_backend_buffer_get_size(buf));
                     }
 #endif
-                } else {
-                    throw std::runtime_error("failed to allocate cpu buffer");
                 }
             }
         }
@@ -5176,9 +5176,7 @@ static bool llm_load_tensors(
                 }
                 ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
                 if (buf != nullptr) {
-                    bufs.push_back(buf);
-                } else {
-                    throw std::runtime_error("failed to allocate metal buffer");
+                    bufs.emplace(file_no, buf);
                 }
             }
         }
@@ -5192,9 +5190,9 @@ static bool llm_load_tensors(
                     mlock_buf->init(ggml_backend_buffer_get_base(buf));
                     mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
                 }
-                bufs.push_back(buf);
-            } else {
-                throw std::runtime_error("failed to allocate backend buffer");
+                for (uint32_t file_no = 0; file_no < ml.files.size(); file_no++) {
+                    bufs.emplace(file_no, buf);
+                }
             }
         }
         if (bufs.empty()) {
@@ -5202,12 +5200,9 @@ static bool llm_load_tensors(
         }
        // indicate that this buffer contains weights
        // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
-        for (ggml_backend_buffer_t buf : bufs) {
-            if (buf == nullptr) {
-                continue;
-            }
-            ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-            model.bufs.push_back(buf);
+        for (auto & buf : bufs) {
+            ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+            model.bufs.push_back(buf.second);
         }
 
         ctx_bufs.emplace_back(ctx, bufs);
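
Note (not part of the patch): the standalone sketch below illustrates the data-structure change in isolation. Buffers are now keyed by file index in a std::map, an entry is inserted only when the backend buffer allocation succeeds, and lookups check count() before at() instead of relying on nullptr placeholder entries to keep indices in sync. fake_buffer and try_alloc are hypothetical stand-ins for ggml_backend_buffer_t and the backend-specific allocation calls; this is a sketch of the pattern, not code from llama.cpp.

// keyed_bufs_sketch.cpp
#include <cstdint>
#include <cstdio>
#include <map>

struct fake_buffer { int id; };              // stand-in for ggml_backend_buffer_t

// Hypothetical allocator: fails for odd file indices to simulate a backend
// buffer allocation that does not succeed for some mmap regions.
static fake_buffer * try_alloc(uint32_t file_no) {
    static fake_buffer pool[8];
    if (file_no % 2 != 0) {
        return nullptr;
    }
    pool[file_no].id = (int) file_no;
    return &pool[file_no];
}

int main() {
    // Only files whose allocation succeeded get an entry; no dummy nullptr
    // entries are needed (the old std::vector approach required them).
    std::map<uint32_t, fake_buffer *> bufs;
    for (uint32_t file_no = 0; file_no < 4; file_no++) {
        fake_buffer * buf = try_alloc(file_no);
        if (buf != nullptr) {
            bufs.emplace(file_no, buf);
        }
    }

    // Lookup mirrors the load_all_data() change: an absent key simply means
    // "no mmap buffer for this file" and the caller falls back to nullptr.
    for (uint32_t file_no = 0; file_no < 4; file_no++) {
        fake_buffer * buf_mmap = nullptr;
        if (bufs.count(file_no)) {
            buf_mmap = bufs.at(file_no);
        }
        std::printf("file %u -> %s\n", (unsigned) file_no, buf_mmap ? "mapped buffer" : "no buffer");
    }
    return 0;
}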