Fix Vulkan no kv offload incoherence

author 0cc4m
date   2024-03-09 07:41:36 +01:00
parent 6cdabe6526
commit 492ad4b0e0
3 changed files with 28 additions and 4 deletions

ggml-vulkan.cpp

@@ -5080,6 +5080,9 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript
 // CPU assist interface
 void ggml_vk_init_cpu_assist() {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_init_cpu_assist()" << std::endl;
+#endif
     ggml_vk_instance_init();
     std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
@@ -5092,6 +5095,9 @@ void ggml_vk_init_cpu_assist() {
 }
 void ggml_vk_preallocate_buffers_graph_cpu_assist(ggml_tensor * node) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_preallocate_buffers_graph_cpu_assist()" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
     if (!ctx->initialized) {
@@ -5102,6 +5108,9 @@ void ggml_vk_preallocate_buffers_graph_cpu_assist(ggml_tensor * node) {
 }
 void ggml_vk_preallocate_buffers_cpu_assist() {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_preallocate_buffers_cpu_assist()" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
     if (!ctx->initialized) {
@@ -5111,17 +5120,23 @@ void ggml_vk_preallocate_buffers_cpu_assist() {
     ggml_vk_preallocate_buffers(ctx);
 }
-void ggml_vk_build_graph_cpu_assist(ggml_tensor * node, bool last_node) {
+void ggml_vk_build_graph_cpu_assist(ggml_tensor * node) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_build_graph_cpu_assist()" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
     if (!ctx->initialized) {
         return;
     }
-    ggml_vk_build_graph(ctx, node, last_node);
+    ggml_vk_build_graph(ctx, node, true);
 }
 bool ggml_vk_compute_forward_cpu_assist(ggml_compute_params * params, ggml_tensor * tensor){
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_compute_forward_cpu_assist()" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
     if (!ctx->initialized) {
@@ -5132,6 +5147,9 @@ bool ggml_vk_compute_forward_cpu_assist(ggml_compute_params * params, ggml_tenso
 }
 void ggml_vk_graph_cleanup_cpu_assist() {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_graph_cleanup_cpu_assist()" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
     if (!ctx->initialized) {
@@ -5142,6 +5160,9 @@ void ggml_vk_graph_cleanup_cpu_assist() {
 }
 void ggml_vk_free_cpu_assist() {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_init_cpu_assist()" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
     if (!ctx->initialized || vk_instance.backends[0] == nullptr) {
@@ -5574,6 +5595,9 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
 }
 GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     for (int i = 0; i < cgraph->n_nodes; i++) {
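Taken together, the changes to this file add GGML_VULKAN_DEBUG tracing to every cpu-assist entry point and drop the last_node parameter from ggml_vk_build_graph_cpu_assist, which now always forwards true. Below is a minimal annotated sketch of the post-commit wrapper, mirroring the hunk above; the interpretation in the comments is an assumption based on the commit title, not text from the diff.

// Sketch: the wrapper as it reads after this commit (see the hunk above).
void ggml_vk_build_graph_cpu_assist(ggml_tensor * node) {
#ifdef GGML_VULKAN_DEBUG
    std::cerr << "ggml_vk_build_graph_cpu_assist()" << std::endl;
#endif
    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];

    if (!ctx->initialized) {
        return;
    }

    // Every node is now built as if it were the last node of the graph,
    // presumably so that its Vulkan work is submitted immediately instead of
    // being batched until the true end of the graph. In the cpu-assist path,
    // CPU ops (e.g. a KV cache that is not offloaded) can be interleaved
    // between GPU-built nodes and need the preceding results to be coherent.
    ggml_vk_build_graph(ctx, node, true);
}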

ggml-vulkan.h

@@ -15,7 +15,7 @@ GGML_API void ggml_vk_init_cpu_assist(void);
 GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
 GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
-GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
+GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node);
 GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 #ifdef GGML_VULKAN_CHECK_RESULTS
 void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);

ggml.c

@@ -18044,7 +18044,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     ggml_vk_preallocate_buffers_cpu_assist();
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
+        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i]);
     }
 #endif
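
For context, here is a hedged sketch of how these cpu-assist hooks end up being driven for a graph after this change. The helper name vk_cpu_assist_prepare_graph and the first loop are illustrative assumptions; only the preallocate call and the build-graph loop appear verbatim in the hunk above.

// Sketch only, not verbatim ggml.c. Assumes the public cpu-assist API
// declared in ggml-vulkan.h as shown above.
#include "ggml.h"
#include "ggml-vulkan.h"

static void vk_cpu_assist_prepare_graph(struct ggml_cgraph * cgraph) {
    // Per-node buffer sizing pass (assumed to precede the preallocation call).
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
    }
    ggml_vk_preallocate_buffers_cpu_assist();

    for (int i = 0; i < cgraph->n_nodes; i++) {
        // After this commit the node index is irrelevant: each node is built
        // as if it were the last one, so its GPU work is not deferred past
        // CPU ops that may depend on it.
        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i]);
    }
}

During execution, ggml_vk_compute_forward_cpu_assist() is then presumably given first shot at each op, with ggml_vk_graph_cleanup_cpu_assist() releasing per-graph resources afterwards, matching the entry points touched in ggml-vulkan.cpp above.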