llama_model_loader: map file to backend buffer only if the allocation succeeds

Author: Pierrick HYMBERT
Date:   2024-03-21 21:33:14 +01:00
parent 02020b0463
commit 078a1aca06


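In short: the loader used to keep one backend buffer per model file in a std::vector, pushing nullptr placeholders so the indices stayed aligned with the file mappings; this commit keys the buffers by file index in a std::map instead, so an entry exists only for files whose buffer allocation actually succeeded. A minimal standalone sketch of the two lookup styles (the dummy_buffer type and the values are made up; only the container usage mirrors the diff):

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <vector>

    struct dummy_buffer { int id; };  // stand-in for ggml_backend_buffer_t

    int main() {
        dummy_buffer file1_buf{1};

        // old style: one slot per file, nullptr marks "no buffer for this file"
        std::vector<dummy_buffer *> bufs_vec = { nullptr, &file1_buf };
        uint32_t idx = 0;
        dummy_buffer * old_style = bufs_vec.size() > idx ? bufs_vec.at(idx) : nullptr;

        // new style: an entry exists only when the allocation succeeded
        std::map<uint32_t, dummy_buffer *> bufs_map;
        bufs_map.emplace(1u, &file1_buf);  // file 0 intentionally has no entry
        dummy_buffer * new_style = nullptr;
        if (bufs_map.count(idx)) {
            new_style = bufs_map.at(idx);
        }

        std::printf("old: %p  new: %p\n", (void *) old_style, (void *) new_style);
        return 0;
    }

With the map, the hunks below can drop the dummy nullptr entry, the per-buffer nullptr checks, and the hard throws on allocation failure, while the existing bufs.empty() check still catches the case where nothing could be allocated.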
@@ -3192,7 +3192,7 @@ struct llama_model_loader {
     void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
         GGML_ASSERT(!mappings.empty());
-        const auto & mapping = mappings[idx];
+        const auto & mapping = mappings.at(idx);

         *first = mapping->size;
         *last  = 0;
@@ -3211,7 +3211,7 @@ struct llama_model_loader {
     void load_data_for(struct ggml_tensor * cur) const {
         const auto & w = get_weights(ggml_get_name(cur));

-        if (use_mmap && w.idx < mappings.size()) {
+        if (use_mmap) {
             const auto & mapping = mappings.at(w.idx);
             if (cur->data == nullptr) {
                 cur->data = (uint8_t *)mapping->addr + w.offs;
@@ -3232,7 +3232,7 @@ struct llama_model_loader {
     std::vector<std::pair<size_t, size_t>> mmaps_used;

     // Returns false if cancelled by progress_callback
-    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::vector<ggml_backend_buffer_t> bufs_mmap, std::vector<std::unique_ptr<llama_mlock>> * lmlocks) {
+    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::map<uint32_t, ggml_backend_buffer *> bufs_mmap, std::vector<std::unique_ptr<llama_mlock>> * lmlocks) {
         GGML_ASSERT(size_data != 0 && "call init_mappings() first");

         std::vector<no_init<uint8_t>> read_buf;
@@ -3246,9 +3246,12 @@ struct llama_model_loader {
             const auto & w = get_weights(ggml_get_name(cur));
             size_t n_size = ggml_nbytes(cur);

-            if (use_mmap && w.idx < mappings.size()) {
+            if (use_mmap) {
                 const auto & mapping = mappings.at(w.idx);
-                ggml_backend_buffer_t buf_mmap = bufs_mmap.size() > w.idx ? bufs_mmap.at(w.idx) : nullptr;
+                ggml_backend_buffer_t buf_mmap = nullptr;
+                if (bufs_mmap.count(w.idx)) {
+                    buf_mmap = bufs_mmap.at(w.idx);
+                }
                 GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                 if (buf_mmap && cur->data == nullptr) {
                     ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + w.offs);
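A note on the lookup pattern this hunk introduces: std::map::operator[] would default-insert a null entry for a missing key, so the code probes with count() and reads with at(); a single find() would be an equivalent one-probe spelling. A small self-contained illustration (int * stands in for ggml_backend_buffer_t, the key 0 is arbitrary):

    #include <cassert>
    #include <cstdint>
    #include <map>

    int main() {
        std::map<uint32_t, int *> bufs_mmap;   // stand-in for the file-index -> buffer map

        // bufs_mmap[0];                       // operator[] would insert {0, nullptr} as a side effect

        // the pattern used in the hunk: probe first, then read
        int * buf_mmap = nullptr;
        if (bufs_mmap.count(0)) {
            buf_mmap = bufs_mmap.at(0);
        }
        assert(buf_mmap == nullptr && bufs_mmap.empty());  // no entry was created

        // equivalent single-probe alternative
        auto it = bufs_mmap.find(0);
        buf_mmap = (it != bufs_mmap.end()) ? it->second : nullptr;
        assert(buf_mmap == nullptr);
        return 0;
    }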
@@ -3283,7 +3286,7 @@ struct llama_model_loader {
         // check if this is the last call and do final cleanup
         if (size_done >= size_data) {
             // unmap offloaded tensors and metadata
-            if (use_mmap && !mappings.empty()) {
+            if (use_mmap) {
                 for (uint32_t file_no = 0; file_no < mappings.size(); file_no++) {
                     const auto & mmap_used = mmaps_used[file_no];
                     auto & mapping = mappings.at(file_no);
@@ -5129,12 +5132,12 @@ static bool llm_load_tensors(
     ml.init_mappings(true, &model.mlock_mmaps);

     // create the backend buffers
-    std::vector<std::pair<ggml_context *, std::vector<ggml_backend_buffer_t>>> ctx_bufs;
+    std::vector<std::pair<ggml_context *, std::map<uint32_t, ggml_backend_buffer_t>>> ctx_bufs;

     for (auto & it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
-        std::vector<ggml_backend_buffer_t> bufs;
+        std::map<uint32_t, ggml_backend_buffer_t> bufs;

         // only the mmap region containing the tensors in the model is mapped to the backend buffer
         // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
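The per-file [first, last) range used in the next hunk comes from get_mapping_range, which presumably shrinks first/last over the tensors of the context (the *first = mapping->size / *last = 0 initialization in the first hunk points that way). A rough standalone sketch of that min/max computation under that assumption (tensor offsets are made up):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // smallest [first, last) byte range of one mmap'ed file that covers every tensor
    struct tensor_span { size_t offs; size_t nbytes; };

    int main() {
        std::vector<tensor_span> tensors = { {4096, 1024}, {8192, 2048}, {2048, 512} };

        size_t first = SIZE_MAX;  // the diff seeds this with mapping->size instead
        size_t last  = 0;
        for (const auto & t : tensors) {
            first = std::min(first, t.offs);
            last  = std::max(last,  t.offs + t.nbytes);
        }

        // only this sub-range would be handed to something like
        //   ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first)
        std::printf("first=%zu last=%zu size=%zu\n", first, last, last - first);
        return 0;
    }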
@@ -5145,12 +5148,11 @@ static bool llm_load_tensors(
                 size_t first, last;
                 ml.get_mapping_range(&first, &last, &addr, file_no, ctx);
                 if (first >= last) {
-                    bufs.push_back(nullptr); // add a dummy buffer to keep the indices in sync
                     continue;
                 }
                 ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *)addr + first, last - first);
                 if (buf != nullptr) {
-                    bufs.push_back(buf);
+                    bufs.emplace(file_no, buf);
 #ifdef GGML_USE_CUBLAS
                     if (n_layer >= n_gpu_layers) {
                         ggml_backend_cuda_register_host_buffer(
@@ -5158,8 +5160,6 @@ static bool llm_load_tensors(
                             ggml_backend_buffer_get_size(buf));
                     }
 #endif
-                } else {
-                    throw std::runtime_error("failed to allocate cpu buffer");
                 }
             }
         }
@@ -5176,9 +5176,7 @@ static bool llm_load_tensors(
                 }
                 ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
                 if (buf != nullptr) {
-                    bufs.push_back(buf);
-                } else {
-                    throw std::runtime_error("failed to allocate metal buffer");
+                    bufs.emplace(file_no, buf);
                 }
             }
         }
@@ -5192,9 +5190,9 @@ static bool llm_load_tensors(
                     mlock_buf->init(ggml_backend_buffer_get_base(buf));
                     mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
                 }
-                bufs.push_back(buf);
-            } else {
-                throw std::runtime_error("failed to allocate backend buffer");
+                for (uint32_t file_no = 0; file_no < ml.files.size(); file_no++) {
+                    bufs.emplace(file_no, buf);
+                }
             }
         }
         if (bufs.empty()) {
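In this branch a single backend buffer is allocated for the whole context rather than one per mmap'ed file, so the new loop registers that one buffer under every file index, presumably so that later lookups keyed by a file index still resolve. A standalone sketch of that shape (buffer type stubbed, the file count is made up):

    #include <cassert>
    #include <cstdint>
    #include <map>

    struct dummy_buffer { int id; };   // stand-in for ggml_backend_buffer_t

    int main() {
        const uint32_t n_files = 3;    // plays the role of ml.files.size()
        dummy_buffer shared{42};       // the one buffer allocated for the context

        std::map<uint32_t, dummy_buffer *> bufs;
        for (uint32_t file_no = 0; file_no < n_files; file_no++) {
            bufs.emplace(file_no, &shared);
        }

        // a lookup keyed by any file index finds the shared buffer
        for (uint32_t file_no = 0; file_no < n_files; file_no++) {
            assert(bufs.count(file_no) && bufs.at(file_no) == &shared);
        }
        return 0;
    }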
@@ -5202,12 +5200,9 @@ static bool llm_load_tensors(
         }

         // indicate that this buffer contains weights
         // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
-        for (ggml_backend_buffer_t buf : bufs) {
-            if (buf == nullptr) {
-                continue;
-            }
-            ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-            model.bufs.push_back(buf);
+        for (auto & buf : bufs) {
+            ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+            model.bufs.push_back(buf.second);
         }
         ctx_bufs.emplace_back(ctx, bufs);
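Because bufs is now a map, the range-for in this last hunk yields key/value pairs, which is why the usage flag and the model.bufs registration go through buf.second. A minimal illustration (stubbed types and made-up values), including the equivalent C++17 structured-bindings spelling:

    #include <cstdint>
    #include <cstdio>
    #include <map>

    struct dummy_buffer { int id; };  // stand-in for ggml_backend_buffer_t

    int main() {
        dummy_buffer a{1}, b{2};
        std::map<uint32_t, dummy_buffer *> bufs = { {0u, &a}, {1u, &b} };

        // iterating a std::map yields std::pair<const uint32_t, dummy_buffer *>,
        // hence the buf.second accesses in the hunk above
        for (auto & buf : bufs) {
            std::printf("file %u -> buffer %d\n", (unsigned) buf.first, buf.second->id);
        }

        // equivalent with structured bindings (C++17)
        for (auto & [file_no, backend_buf] : bufs) {
            std::printf("file %u -> buffer %d\n", (unsigned) file_no, backend_buf->id);
        }
        return 0;
    }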