From 492ad4b0e09a4672bff124fd8534bed618852aeb Mon Sep 17 00:00:00 2001
From: 0cc4m <picard12@live.de>
Date: Sat, 9 Mar 2024 07:41:36 +0100
Subject: [PATCH] Fix Vulkan no kv offload incoherence

---
 ggml-vulkan.cpp | 28 ++++++++++++++++++++++++++--
 ggml-vulkan.h   |  2 +-
 ggml.c          |  2 +-
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 5a1b3f477..027cba2fc 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -5080,6 +5080,9 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript
 // CPU assist interface
 
 void ggml_vk_init_cpu_assist() {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_init_cpu_assist()" << std::endl;
+#endif
     ggml_vk_instance_init();
 
     std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
@@ -5092,6 +5095,9 @@ void ggml_vk_init_cpu_assist() {
 }
 
 void ggml_vk_preallocate_buffers_graph_cpu_assist(ggml_tensor * node) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_preallocate_buffers_graph_cpu_assist()" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
 
     if (!ctx->initialized) {
@@ -5102,6 +5108,9 @@ void ggml_vk_preallocate_buffers_graph_cpu_assist(ggml_tensor * node) {
 }
 
 void ggml_vk_preallocate_buffers_cpu_assist() {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_preallocate_buffers_cpu_assist()" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
 
     if (!ctx->initialized) {
@@ -5111,17 +5120,23 @@ void ggml_vk_preallocate_buffers_cpu_assist() {
     ggml_vk_preallocate_buffers(ctx);
 }
 
-void ggml_vk_build_graph_cpu_assist(ggml_tensor * node, bool last_node) {
+void ggml_vk_build_graph_cpu_assist(ggml_tensor * node) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_build_graph_cpu_assist()" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
 
     if (!ctx->initialized) {
         return;
     }
 
-    ggml_vk_build_graph(ctx, node, last_node);
+    ggml_vk_build_graph(ctx, node, true);
 }
 
 bool ggml_vk_compute_forward_cpu_assist(ggml_compute_params * params, ggml_tensor * tensor){
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_compute_forward_cpu_assist()" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
 
     if (!ctx->initialized) {
@@ -5132,6 +5147,9 @@ bool ggml_vk_compute_forward_cpu_assist(ggml_compute_params * params, ggml_tenso
 }
 
 void ggml_vk_graph_cleanup_cpu_assist() {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_graph_cleanup_cpu_assist()" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
 
     if (!ctx->initialized) {
@@ -5142,6 +5160,9 @@ void ggml_vk_graph_cleanup_cpu_assist() {
 }
 
 void ggml_vk_free_cpu_assist() {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_free_cpu_assist()" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
 
     if (!ctx->initialized || vk_instance.backends[0] == nullptr) {
@@ -5574,6 +5595,9 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
 }
 
 GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
index e4317c3e0..cec2bbd7f 100644
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@@ -15,7 +15,7 @@ GGML_API void ggml_vk_init_cpu_assist(void);
 
 GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
 GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
-GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
+GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node);
 GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 #ifdef GGML_VULKAN_CHECK_RESULTS
 void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
diff --git a/ggml.c b/ggml.c
index 92b17ee6e..01c6846b4 100644
--- a/ggml.c
+++ b/ggml.c
@@ -18044,7 +18044,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     ggml_vk_preallocate_buffers_cpu_assist();
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
+        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i]);
     }
 #endif