llama_model_loader: immediately add the backend buffer to the model buffers in order to free them if an error occurs in the next allocation. Reserve the expected size.
This commit is contained in:
parent ec372c66a4
commit a9e88c6e57
1 changed file with 10 additions and 3 deletions
llama.cpp | 13 ++++++++++---
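The change applies a standard exception-safety pattern: a freshly allocated buffer goes into the owning container (model.bufs) before the next allocation gets a chance to throw, so the loader's error path can free everything allocated so far. A minimal standalone sketch of the pattern, using hypothetical Buffer/alloc_buffer/free_buffer stand-ins rather than the real ggml API:

#include <cstddef>
#include <stdexcept>
#include <vector>

// hypothetical stand-in for a backend buffer handle (not the ggml API)
struct Buffer { size_t size; };

Buffer * alloc_buffer(size_t size) {
    // stand-in allocator that fails for large requests
    return size <= 2048 ? new Buffer{size} : nullptr;
}

void free_buffer(Buffer * buf) {
    delete buf;
}

// allocate a series of buffers, registering each one in `owned`
// *before* the next allocation can fail
void alloc_all(std::vector<Buffer *> & owned, const std::vector<size_t> & sizes) {
    owned.reserve(sizes.size()); // reserve the expected size up front
    for (size_t size : sizes) {
        Buffer * buf = alloc_buffer(size);
        if (buf == nullptr) {
            // everything allocated so far is already in `owned`,
            // so the caller's error path can free it
            throw std::runtime_error("unable to allocate backend buffer");
        }
        owned.push_back(buf); // register immediately: no window where buf is unowned
    }
}

int main() {
    std::vector<Buffer *> owned;
    try {
        alloc_all(owned, {1024, 2048, 4096}); // the last request fails
    } catch (const std::exception &) {
        for (Buffer * buf : owned) {
            free_buffer(buf); // the two successful allocations are released here
        }
        owned.clear();
    }
    return 0;
}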
@@ -5137,12 +5137,17 @@ static bool llm_load_tensors(
     ml.init_mappings(true, &model.mlock_mmaps);
 
     // create the backend buffers
-    std::vector<std::pair<ggml_context *, std::map<uint32_t, ggml_backend_buffer_t>>> ctx_bufs;
+    std::vector<std::pair<ggml_context *, std::unordered_map<uint32_t, ggml_backend_buffer_t>>> ctx_bufs;
+
+    // Ensure we have enough capacity for the maximum backend buffer we will potentially create
+    size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
+    model.bufs.reserve(n_max_backend_buffer);
 
     for (auto & it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
-        std::map<uint32_t, ggml_backend_buffer_t> bufs;
+        std::unordered_map<uint32_t, ggml_backend_buffer_t> bufs;
+        bufs.reserve(n_max_backend_buffer);
 
         // only the mmap region containing the tensors in the model is mapped to the backend buffer
         // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
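A side effect of moving from std::map to std::unordered_map is that the container can be pre-sized: std::map is node-based and offers no reserve(), while unordered_map::reserve(n) sizes the bucket table so that inserting up to n elements never rehashes. A small sketch of the worst-case capacity computed above (the counts are illustrative, not taken from a real model):

#include <cstdint>
#include <cstdio>
#include <unordered_map>

int main() {
    // worst case: one backend buffer per (context, file) pair, mirroring
    // n_max_backend_buffer = ctx_map.size() * ml.files.size()
    size_t n_contexts = 2;  // e.g. one ggml context per buffer type
    size_t n_files    = 4;  // e.g. a model split across four files
    size_t n_max_backend_buffer = n_contexts * n_files;

    std::unordered_map<uint32_t, void *> bufs;
    bufs.reserve(n_max_backend_buffer); // no rehashing for up to 8 insertions
    std::printf("buckets reserved for %zu entries: %zu\n",
                n_max_backend_buffer, (size_t) bufs.bucket_count());
    return 0;
}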
@@ -5159,6 +5164,7 @@ static bool llm_load_tensors(
             if (buf == nullptr) {
                 throw std::runtime_error("unable to allocate backend CPU buffer");
             }
+            model.bufs.push_back(buf);
             bufs.emplace(idx, buf);
 #ifdef GGML_USE_CUBLAS
             if (n_layer >= n_gpu_layers) {
@@ -5183,6 +5189,7 @@ static bool llm_load_tensors(
                 if (buf == nullptr) {
                     throw std::runtime_error("unable to allocate backend metal buffer");
                 }
+                model.bufs.push_back(buf);
                 bufs.emplace(idx, buf);
             }
         }
@@ -5192,6 +5199,7 @@ static bool llm_load_tensors(
             if (buf == nullptr) {
                 throw std::runtime_error("unable to allocate backend buffer");
             }
+            model.bufs.push_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                 model.mlock_bufs.emplace_back(new llama_mlock);
                 auto & mlock_buf = model.mlock_bufs.back();
@@ -5209,7 +5217,6 @@ static bool llm_load_tensors(
             // indicate that this buffer contains weights
             // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
             ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-            model.bufs.push_back(buf.second);
         }
 
         ctx_bufs.emplace_back(ctx, bufs);
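With each buffer now registered in model.bufs at the moment it is allocated, the old model.bufs.push_back(buf.second); at the end of the loop would have recorded every buffer twice, which is why the final hunk removes it.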