diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index c4ebc584b..3d9473e3f 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -9750,7 +9750,6 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
         }
 
         char * buf_host = (char *)data + offset_split;
-        //CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
         CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
     }
 }
diff --git a/llama.cpp b/llama.cpp
index 8b0ba7121..35de91b61 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2446,6 +2446,18 @@ struct llama_model_loader {
         }
     }
 
+    void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
+        GGML_ASSERT(mapping);
+
+        *first = mapping->size;
+        *last  = 0;
+        for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
+            const size_t offs = file_offset(ggml_get_name(tensor));
+            *first = std::min(*first, offs);
+            *last  = std::max(*last,  offs + ggml_nbytes(tensor));
+        }
+    }
+
     // for backwards compatibility, does not support ggml-backend
     void load_data_for(struct ggml_tensor * cur) const {
         const size_t offs = file_offset(ggml_get_name(cur));
@@ -3733,15 +3745,19 @@ static bool llm_load_tensors(
         ggml_context * ctx = it.second;
         ggml_backend_buffer_t buf = nullptr;
 
-        // TODO: do not use the whole model mapping for the buffer, only the region containing the tensors
-        // this is important for metal: if the entire model could be mapped to a metal buffer, then we could use metal for all layers
+        // only the region containing the tensors in the model is mapped to the backend buffer
+        // this is important for metal: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
         if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
-            buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size);
+            size_t first, last;
+            ml.get_mapping_range(&first, &last, ctx);
+            buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
         }
 #ifdef GGML_USE_METAL
         else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
             const size_t max_size = ggml_get_max_tensor_size(ctx);
-            buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size);
+            size_t first, last;
+            ml.get_mapping_range(&first, &last, ctx);
+            buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size);
         }
 #endif
         else {
@@ -5922,7 +5938,6 @@ static struct ggml_cgraph * llama_build_graph(
             alloc_inp_tokens = true;
         }
 
-        // FIXME: allocating this conditionally will result in issues with the measure allocator
         if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) {
             ggml_tallocr_alloc(lctx.alloc, cur);
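
For reference, the heart of the change is the `[first, last)` reduction in `get_mapping_range`: instead of wrapping the whole mmap in a backend buffer, only the byte range actually spanned by the tensors of a given `ggml_context` is exposed. Below is a minimal, self-contained sketch of that reduction; the `tensor_span` struct and the sample offsets/sizes are hypothetical stand-ins for a real `ggml_context`, not part of the patch.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for the tensors of one ggml_context: (file offset, size in bytes).
struct tensor_span { size_t offs; size_t nbytes; };

// Mirrors llama_model_loader::get_mapping_range: reduce to the smallest
// [first, last) window that covers every tensor in the context.
static void get_mapping_range(size_t * first, size_t * last, size_t mapping_size,
                              const std::vector<tensor_span> & tensors) {
    *first = mapping_size;
    *last  = 0;
    for (const auto & t : tensors) {
        *first = std::min(*first, t.offs);
        *last  = std::max(*last,  t.offs + t.nbytes);
    }
}

int main() {
    const size_t mapping_size = 1u << 20;                  // pretend the mmap is 1 MiB
    std::vector<tensor_span> tensors = {                   // example tensor layout
        { 4096, 8192 }, { 12288, 2048 }, { 65536, 4096 },
    };

    size_t first, last;
    get_mapping_range(&first, &last, mapping_size, tensors);

    // The backend buffer is then created from (addr + first, last - first)
    // rather than (addr, mapping_size).
    std::printf("buffer window: [%zu, %zu), size %zu of %zu mapped bytes\n",
                first, last, last - first, mapping_size);
    return 0;
}
```

With the sample layout above the window comes out as `[4096, 69632)`, i.e. 64 KiB of the 1 MiB mapping, which mirrors the new comment in the patch: only the region containing the tensors is handed to the backend buffer.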