Properly implement Vulkan backend buffer handling

2024-01-21 10:28:46 +01:00 · 2024-01-21 10:28:46 +01:00 · 6e6174206f
commit 6e6174206f
parent c0f3474ed5
4 changed files with 283 additions and 453 deletions
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@@ -11,7 +11,7 @@ extern "C" {
 GGML_API void ggml_vk_init(void);
-GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node, struct ggml_cgraph * graph);
+GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node);
 GGML_API void ggml_vk_preallocate_buffers(void);
 GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, bool last_node);
 GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
@@ -24,8 +24,6 @@ GGML_API void ggml_vk_graph_cleanup(void);
 GGML_API void * ggml_vk_host_malloc(size_t size);
 GGML_API void   ggml_vk_host_free(void * ptr);
 GGML_API void ggml_vk_free_data(const struct ggml_tensor * tensor);
 GGML_API void ggml_vk_transform_tensor_temporary(const void * data, struct ggml_tensor * tensor);
 GGML_API void ggml_vk_transform_tensor_static(const void * data, struct ggml_tensor * tensor);
 GGML_API void ggml_vk_assign_buffer(struct ggml_tensor * tensor);
--- a/ggml.c
+++ b/ggml.c
@@ -16871,24 +16871,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
    }
 #ifdef GGML_USE_VULKAN
    for (int i = 0; i < cgraph->n_leafs; i++) {
        struct ggml_tensor * node = cgraph->leafs[i];
        if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
            ggml_vk_transform_tensor_temporary(node->data, node);
        }
    }
    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_preallocate_buffers_graph(cgraph->nodes[i], cgraph);
+        ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
    }
    ggml_vk_preallocate_buffers();
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
    }
    // Set last tensor to CPU to force copy to CPU
    cgraph->nodes[cgraph->n_nodes - 1]->backend = GGML_BACKEND_CPU;
 #endif
    const int n_threads = cplan->n_threads;
@@ -16941,6 +16931,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
        }
    }
 #ifdef GGML_USE_VULKAN
    ggml_vk_graph_cleanup();
 #endif
    // performance stats (graph)
    {
        int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_start_cycles;
--- a/llama.cpp
+++ b/llama.cpp
@@ -9489,7 +9489,6 @@ struct llama_context * llama_new_context_with_model(
        }
 #elif defined(GGML_USE_VULKAN)
        if (model->n_gpu_layers > 0) {
            // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
            ggml_backend_t backend = ggml_backend_vk_init();
            if (backend == nullptr) {
                LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);