fix mmap buffer management

Author: slaren
Date:   2024-03-21 22:06:37 +01:00
Parent: d8b567d254
Commit: 02020b0463
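
The model loader maps each model file separately and keeps one backend buffer per file. This change fixes the bookkeeping around those per-file buffers: get_mapping_range now ignores tensors that belong to a different file, files that contribute no tensors to a context get a nullptr placeholder buffer so the buffer lists stay index-aligned with the files, and code that walks the buffer lists skips the placeholders.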


@@ -3199,6 +3199,9 @@ struct llama_model_loader {
         *addr = mapping->addr;
         for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
             const auto & w = get_weights(ggml_get_name(tensor));
+            if (w.idx != idx) {
+                continue;
+            }
             *first = std::min(*first, w.offs);
             *last  = std::max(*last,  w.offs + ggml_nbytes(tensor));
         }
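
Note: without the added w.idx filter, the loop accumulated offsets from tensors in every file, so the [first, last) range reported for one mapping could extend past that file's data. The check restricts the range to weights that actually live in file idx.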
@@ -3245,12 +3248,8 @@ struct llama_model_loader {
             if (use_mmap && w.idx < mappings.size()) {
                 const auto & mapping = mappings.at(w.idx);
-                ggml_backend_buffer_t buf_mmap = nullptr;
-                if (bufs_mmap.size() > 1) {
-                    buf_mmap = bufs_mmap.at(w.idx);
-                } else if (!bufs_mmap.empty()) {
-                    buf_mmap = bufs_mmap.front();
-                }
+                ggml_backend_buffer_t buf_mmap = bufs_mmap.size() > w.idx ? bufs_mmap.at(w.idx) : nullptr;
                 GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                 if (buf_mmap && cur->data == nullptr) {
                     ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + w.offs);
                     if (lmlocks) {
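
Note: the removed size() > 1 / front() special-casing apparently dated from when a single buffer could serve all files. With bufs_mmap kept index-aligned with the mappings (see the placeholder hunks below), a plain bounds-checked lookup at w.idx suffices, and the assertion still catches tensors left with neither a buffer nor preallocated data.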
@@ -5145,6 +5144,10 @@ static bool llm_load_tensors(
                 void * addr = nullptr;
                 size_t first, last;
                 ml.get_mapping_range(&first, &last, &addr, file_no, ctx);
+                if (first >= last) {
+                    bufs.push_back(nullptr); // add a dummy buffer to keep the indices in sync
+                    continue;
+                }
                 ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *)addr + first, last - first);
                 if (buf != nullptr) {
                     bufs.push_back(buf);
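
Note: get_mapping_range leaves first >= last when the context contains no tensors from file_no. Rather than creating a zero-size buffer, or skipping the file and shifting every subsequent index, a nullptr placeholder keeps bufs aligned with the file numbering.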
@@ -5167,6 +5170,10 @@ static bool llm_load_tensors(
                 void * addr = nullptr;
                 size_t first, last;
                 ml.get_mapping_range(&first, &last, &addr, file_no, ctx);
+                if (first >= last) {
+                    bufs.push_back(nullptr); // add a dummy buffer to keep the indices in sync
+                    continue;
+                }
                 ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
                 if (buf != nullptr) {
                     bufs.push_back(buf);
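
Note: the Metal path needs the same guard; the only difference is that the mapped range is wrapped with ggml_backend_metal_buffer_from_ptr instead of a CPU buffer.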
@@ -5196,6 +5203,9 @@ static bool llm_load_tensors(
             // indicate that this buffer contains weights
             // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
             for (ggml_backend_buffer_t buf : bufs) {
+                if (buf == nullptr) {
+                    continue;
+                }
                 ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
                 model.bufs.push_back(buf);
             }
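
Note: the placeholder entries pushed above reach every consumer of bufs, so each loop over the buffers has to tolerate nullptr. Below is a minimal standalone sketch of the bookkeeping pattern, not code from the commit: Buffer, FileRange, and make_buffer are hypothetical stand-ins for the ggml types, and only the index alignment is illustrated.

// Minimal sketch of the index-aligned placeholder pattern, not code from
// the commit: Buffer, FileRange, and make_buffer are hypothetical stand-ins
// for the ggml types; only the bookkeeping is illustrated.
#include <cstddef>
#include <cstdio>
#include <memory>
#include <vector>

struct Buffer { std::size_t size; };

// one [first, last) range per file; empty means the file contributes
// no tensors to the current context
struct FileRange { std::size_t first, last; };

static std::unique_ptr<Buffer> make_buffer(std::size_t size) {
    return std::make_unique<Buffer>(Buffer{size});
}

int main() {
    // file 1 has an empty range, as when first >= last above
    std::vector<FileRange> ranges = {{0, 4096}, {100, 100}, {0, 2048}};
    std::vector<std::unique_ptr<Buffer>> bufs;

    for (const FileRange & r : ranges) {
        if (r.first >= r.last) {
            bufs.push_back(nullptr); // placeholder keeps bufs[i] matched to file i
            continue;
        }
        bufs.push_back(make_buffer(r.last - r.first));
    }

    // consumers index by file number and skip the placeholders,
    // mirroring the loop in the last hunk
    for (std::size_t i = 0; i < bufs.size(); ++i) {
        if (bufs[i] == nullptr) {
            continue;
        }
        std::printf("file %zu -> buffer of %zu bytes\n", i, bufs[i]->size);
    }
    return 0;
}

The alternative, erasing empty files from the vector, would force every lookup by file index elsewhere in the loader to be remapped; a skip-on-read placeholder is the smaller change.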