llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

This commit is contained in:
slaren 2024-01-08 18:17:18 +01:00
parent 11583c1462
commit 4ed5f621be
2 changed files with 20 additions and 6 deletions

View file

@@ -9750,7 +9750,6 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
}
char * buf_host = (char *)data + offset_split;
//CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
}
}

View file

@@ -2446,6 +2446,18 @@ struct llama_model_loader {
}
}
void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
GGML_ASSERT(mapping);
*first = mapping->size;
*last = 0;
for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
const size_t offs = file_offset(ggml_get_name(tensor));
*first = std::min(*first, offs);
*last = std::max(*last, offs + ggml_nbytes(tensor));
}
}
// for backwards compatibility, does not support ggml-backend
void load_data_for(struct ggml_tensor * cur) const {
const size_t offs = file_offset(ggml_get_name(cur));
@@ -3733,15 +3745,19 @@ static bool llm_load_tensors(
ggml_context * ctx = it.second;
ggml_backend_buffer_t buf = nullptr;
// TODO: do not use the whole model mapping for the buffer, only the region containing the tensors
// this is important for metal: if the entire model could be mapped to a metal buffer, then we could use metal for all layers
// only the region containing the tensors in the model is mapped to the backend buffer
// this is important for metal: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size);
size_t first, last;
ml.get_mapping_range(&first, &last, ctx);
buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
}
#ifdef GGML_USE_METAL
else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
const size_t max_size = ggml_get_max_tensor_size(ctx);
buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size);
size_t first, last;
ml.get_mapping_range(&first, &last, ctx);
buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size);
}
#endif
else {
@@ -5922,7 +5938,6 @@ static struct ggml_cgraph * llama_build_graph(
alloc_inp_tokens = true;
}
// FIXME: allocating this conditionally will result in issues with the measure allocator
if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) {
ggml_tallocr_alloc(lctx.alloc, cur);