Properly implement Vulkan backend buffer handling

This commit is contained in:
0cc4m 2024-01-21 10:28:46 +01:00
parent c0f3474ed5
commit 6e6174206f
4 changed files with 283 additions and 453 deletions

File diff suppressed because it is too large. (Load diff)

View file

@ -11,7 +11,7 @@ extern "C" {
GGML_API void ggml_vk_init(void);
GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node, struct ggml_cgraph * graph);
GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node);
GGML_API void ggml_vk_preallocate_buffers(void);
GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, bool last_node);
GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
@ -24,8 +24,6 @@ GGML_API void ggml_vk_graph_cleanup(void);
GGML_API void * ggml_vk_host_malloc(size_t size);
GGML_API void ggml_vk_host_free(void * ptr);
GGML_API void ggml_vk_free_data(const struct ggml_tensor * tensor);
GGML_API void ggml_vk_transform_tensor_temporary(const void * data, struct ggml_tensor * tensor);
GGML_API void ggml_vk_transform_tensor_static(const void * data, struct ggml_tensor * tensor);
GGML_API void ggml_vk_assign_buffer(struct ggml_tensor * tensor);

ggml.c (16 lines changed)
View file

@ -16871,24 +16871,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
}
#ifdef GGML_USE_VULKAN
for (int i = 0; i < cgraph->n_leafs; i++) {
struct ggml_tensor * node = cgraph->leafs[i];
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
ggml_vk_transform_tensor_temporary(node->data, node);
}
}
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_preallocate_buffers_graph(cgraph->nodes[i], cgraph);
ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
}
ggml_vk_preallocate_buffers();
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
}
// Set last tensor to CPU to force copy to CPU
cgraph->nodes[cgraph->n_nodes - 1]->backend = GGML_BACKEND_CPU;
#endif
const int n_threads = cplan->n_threads;
@ -16941,6 +16931,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
}
}
#ifdef GGML_USE_VULKAN
ggml_vk_graph_cleanup();
#endif
// performance stats (graph)
{
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;

View file

@ -9489,7 +9489,6 @@ struct llama_context * llama_new_context_with_model(
}
#elif defined(GGML_USE_VULKAN)
if (model->n_gpu_layers > 0) {
// with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
ggml_backend_t backend = ggml_backend_vk_init();
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);