diff --git a/llama.cpp b/llama.cpp
index cd20ad7a4..53b5a0608 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3199,6 +3199,9 @@ struct llama_model_loader {
         *addr = mapping->addr;
         for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
             const auto & w = get_weights(ggml_get_name(tensor));
+            if (w.idx != idx) {
+                continue;
+            }
             *first = std::min(*first, w.offs);
             *last = std::max(*last, w.offs + ggml_nbytes(tensor));
         }
@@ -3245,12 +3248,8 @@ struct llama_model_loader {
 
             if (use_mmap && w.idx < mappings.size()) {
                 const auto & mapping = mappings.at(w.idx);
-                ggml_backend_buffer_t buf_mmap = nullptr;
-                if (bufs_mmap.size() > 1) {
-                    buf_mmap = bufs_mmap.at(w.idx);
-                } else if (!bufs_mmap.empty()) {
-                    buf_mmap = bufs_mmap.front();
-                }
+                ggml_backend_buffer_t buf_mmap = bufs_mmap.size() > w.idx ? bufs_mmap.at(w.idx) : nullptr;
+                GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                 if (buf_mmap && cur->data == nullptr) {
                     ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + w.offs);
                     if (lmlocks) {
@@ -5145,6 +5144,10 @@ static bool llm_load_tensors(
                 void * addr = nullptr;
                 size_t first, last;
                 ml.get_mapping_range(&first, &last, &addr, file_no, ctx);
+                if (first >= last) {
+                    bufs.push_back(nullptr); // add a dummy buffer to keep the indices in sync
+                    continue;
+                }
                 ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
                 if (buf != nullptr) {
                     bufs.push_back(buf);
@@ -5167,6 +5170,10 @@ static bool llm_load_tensors(
                 void * addr = nullptr;
                 size_t first, last;
                 ml.get_mapping_range(&first, &last, &addr, file_no, ctx);
+                if (first >= last) {
+                    bufs.push_back(nullptr); // add a dummy buffer to keep the indices in sync
+                    continue;
+                }
                 ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
                 if (buf != nullptr) {
                     bufs.push_back(buf);
@@ -5196,6 +5203,9 @@ static bool llm_load_tensors(
         // indicate that this buffer contains weights
        // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
         for (ggml_backend_buffer_t buf : bufs) {
+            if (buf == nullptr) {
+                continue;
+            }
             ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
             model.bufs.push_back(buf);
         }
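
For reference, a minimal standalone sketch of the invariant this patch establishes (all names here are hypothetical, not llama.cpp code): with multiple file mappings, the [first, last) byte range must be computed only over the tensors that belong to the given file, and an empty range (first >= last) means the file contributes no tensors to this context, in which case the caller pushes a nullptr placeholder so the buffer vector stays indexable by file number.

// sketch.cpp - illustrates the per-file range filter and the nullptr
// placeholder pattern from the patch above; assumed/hypothetical names only
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

struct weight_info {
    int    idx;    // which file mapping the tensor lives in
    size_t offs;   // byte offset of the tensor within that mapping
    size_t nbytes; // tensor size in bytes
};

// Smallest [first, last) range of a mapping of size `mapping_size` covering
// every tensor assigned to file `idx`; first >= last means "no tensors here".
static void mapping_range(const std::vector<weight_info> & weights, int idx,
                          size_t mapping_size, size_t * first, size_t * last) {
    *first = mapping_size;
    *last  = 0;
    for (const auto & w : weights) {
        if (w.idx != idx) {
            continue; // the same filter the first hunk adds to get_mapping_range()
        }
        *first = std::min(*first, w.offs);
        *last  = std::max(*last,  w.offs + w.nbytes);
    }
}

int main() {
    // two tensors in file 0, one in file 1, none in file 2
    std::vector<weight_info> weights = {
        {0, 128, 64}, {1, 0, 256}, {0, 512, 32},
    };
    std::vector<const char *> bufs; // stand-in for the per-file buffer vector
    for (int file_no = 0; file_no < 3; file_no++) {
        size_t first, last;
        mapping_range(weights, file_no, 4096, &first, &last);
        if (first >= last) {
            bufs.push_back(nullptr); // dummy entry keeps bufs indexed by file number
            printf("file %d: empty range, nullptr placeholder\n", file_no);
            continue;
        }
        bufs.push_back("buffer");
        printf("file %d: [%zu, %zu)\n", file_no, first, last);
    }
    return 0;
}

Pushing a nullptr placeholder instead of skipping the entry entirely is what keeps the file-number indexing valid for later lookups; the trade-off is that every consumer of the vector, like the final hunk's loop over bufs, has to skip the null entries.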