diff --git a/llama.cpp b/llama.cpp
index 2105824af..a0f917ce2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3232,7 +3232,7 @@ struct llama_model_loader {
     std::vector<std::pair<size_t, size_t>> mmaps_used;

     // Returns false if cancelled by progress_callback
-    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::map<uint32_t, ggml_backend_buffer_t> bufs_mmap, std::vector<std::unique_ptr<llama_mlock>> * lmlocks) {
+    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::map<uint32_t, ggml_backend_buffer_t> & bufs_mmap, std::vector<std::unique_ptr<llama_mlock>> * lmlocks) {
         GGML_ASSERT(size_data != 0 && "call init_mappings() first");

         std::vector<no_init<uint8_t>> read_buf;
@@ -5151,16 +5151,17 @@ static bool llm_load_tensors(
                     continue;
                 }
                 ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
-                if (buf != nullptr) {
-                    bufs.emplace(idx, buf);
-#ifdef GGML_USE_CUBLAS
-                    if (n_layer >= n_gpu_layers) {
-                        ggml_backend_cuda_register_host_buffer(
-                            ggml_backend_buffer_get_base(buf),
-                            ggml_backend_buffer_get_size(buf));
-                    }
-#endif
+                if (buf == nullptr) {
+                    throw std::runtime_error("unable to allocate backend CPU buffer");
                 }
+                bufs.emplace(idx, buf);
+#ifdef GGML_USE_CUBLAS
+                if (n_layer >= n_gpu_layers) {
+                    ggml_backend_cuda_register_host_buffer(
+                        ggml_backend_buffer_get_base(buf),
+                        ggml_backend_buffer_get_size(buf));
+                }
+#endif
             }
         }
 #ifdef GGML_USE_METAL
@@ -5174,32 +5175,34 @@ static bool llm_load_tensors(
                     continue;
                 }
                 ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
-                if (buf != nullptr) {
-                    bufs.emplace(idx, buf);
+                if (buf == nullptr) {
+                    throw std::runtime_error("unable to allocate backend metal buffer");
                 }
+                bufs.emplace(idx, buf);
             }
         }
 #endif
         else {
             ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
-            if (buf != nullptr) {
-                if (use_mlock && ggml_backend_buffer_is_host(buf)) {
-                    model.mlock_bufs.emplace_back(new llama_mlock);
-                    auto & mlock_buf = model.mlock_bufs.back();
-                    mlock_buf->init(ggml_backend_buffer_get_base(buf));
-                    mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
-                }
-                for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
-                    bufs.emplace(idx, buf);
-                }
+            if (buf == nullptr) {
+                throw std::runtime_error("unable to allocate backend buffer");
+            }
+            if (use_mlock && ggml_backend_buffer_is_host(buf)) {
+                model.mlock_bufs.emplace_back(new llama_mlock);
+                auto & mlock_buf = model.mlock_bufs.back();
+                mlock_buf->init(ggml_backend_buffer_get_base(buf));
+                mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
+            }
+            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+                bufs.emplace(idx, buf);
             }
         }
         if (bufs.empty()) {
             throw std::runtime_error("failed to allocate buffer");
         }
-        // indicate that this buffer contains weights
-        // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
         for (auto & buf : bufs) {
+            // indicate that this buffer contains weights
+            // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
             ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
             model.bufs.push_back(buf.second);
         }
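
Taken together, the changes above replace the `if (buf != nullptr) { ... }` success-path nesting with guard clauses that throw `std::runtime_error` as soon as an allocation fails, and pass `bufs_mmap` into `load_all_data` by reference instead of by value so the map is not copied on every call. The sketch below illustrates the same guard-clause pattern in isolation; it is a minimal, hypothetical example (the `backend_buffer`, `allocate_cpu_buffer`, and `load_tensors` names are stand-ins, not llama.cpp or ggml APIs), with a caller that converts the exception into an error return rather than crashing or silently skipping a buffer.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <map>
#include <memory>
#include <stdexcept>
#include <vector>

// Stand-in buffer type; not a ggml type.
struct backend_buffer {
    std::vector<uint8_t> data;
};

// Stand-in allocator that reports failure by returning nullptr,
// the same convention as the ggml_backend_*_buffer_from_ptr calls in the diff.
static std::unique_ptr<backend_buffer> allocate_cpu_buffer(size_t size) {
    if (size == 0) {
        return nullptr; // simulate an allocation failure
    }
    auto buf = std::make_unique<backend_buffer>();
    buf->data.resize(size);
    return buf;
}

// Guard-clause style from the diff: validate, throw on failure, and keep the
// success path at a single indentation level.
static void load_tensors(std::map<uint32_t, std::unique_ptr<backend_buffer>> & bufs,
                         const std::vector<size_t> & file_sizes) {
    for (uint32_t idx = 0; idx < file_sizes.size(); idx++) {
        auto buf = allocate_cpu_buffer(file_sizes[idx]);
        if (buf == nullptr) {
            throw std::runtime_error("unable to allocate backend CPU buffer");
        }
        bufs.emplace(idx, std::move(buf));
    }
    if (bufs.empty()) {
        throw std::runtime_error("failed to allocate buffer");
    }
}

int main() {
    // The caller turns the exception into a status code, so a failed
    // allocation surfaces as a descriptive error instead of being ignored.
    std::map<uint32_t, std::unique_ptr<backend_buffer>> bufs;
    try {
        load_tensors(bufs, {1024, 0 /* second "file" triggers the failure path */});
    } catch (const std::exception & err) {
        std::fprintf(stderr, "error loading model: %s\n", err.what());
        return 1;
    }
    return 0;
}
```

The net effect mirrored by the sketch: either every buffer ends up registered in `bufs`, or the failure propagates as an exception with a message naming the backend that could not allocate.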