From 59ee00a8801953cdd3872c2d567164884075bc67 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Mon, 7 Oct 2024 20:42:52 +0200
Subject: [PATCH] fix mmap usage when using host buffers

---
 src/llama.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 0d7b97ada..3fb8132f0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8903,7 +8903,8 @@ static bool llm_load_tensors(
         bufs.reserve(n_max_backend_buffer);
 
         // check if this backend device supports buffer_from_host_ptr
-        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        // when using a host buffer as the CPU backend buffer, use the CPU device to prioritize using buffer_from_host_ptr over the host buffer
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft == llama_default_buffer_type_cpu(model, true) ? ggml_backend_cpu_buffer_type() : buft);
         bool buffer_from_host_ptr_supported = false;
         if (dev) {
             ggml_backend_dev_props props;