Switch from multiple semaphore-synchronized command buffers per op to a single command buffer for multiple ops, the whole graph if possible

0cc4m 2024-01-16 21:30:14 +01:00
parent 542ae3b44c
commit c3290d29e0
3 changed files with 298 additions and 387 deletions
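For context, this change replaces the earlier pattern of recording one command buffer per graph op and chaining the submissions with semaphores: all dispatches are now recorded into a single command buffer, ordered by pipeline barriers, and submitted once. The following sketch illustrates that single-command-buffer pattern in plain Vulkan; it is not the ggml-vulkan implementation, and the ComputeOp struct, the submit_ops_in_single_cmd_buffer function, and the barrier placement are illustrative assumptions.

// Illustrative sketch only (not the ggml-vulkan code): record several compute
// dispatches into ONE command buffer, synchronize them with pipeline barriers,
// and submit once with a fence -- no per-op command buffers or semaphores.
// ComputeOp and the function name below are assumptions for this example.

#include <vulkan/vulkan.h>
#include <cstdint>
#include <vector>

struct ComputeOp {
    VkPipeline       pipeline;
    VkPipelineLayout layout;
    VkDescriptorSet  descriptors;
    uint32_t         groups_x, groups_y, groups_z;
};

void submit_ops_in_single_cmd_buffer(VkDevice device, VkQueue queue, VkCommandPool pool,
                                     const std::vector<ComputeOp> & ops, VkFence fence) {
    // Allocate a single primary command buffer for the whole batch of ops.
    VkCommandBufferAllocateInfo alloc_info = {};
    alloc_info.sType              = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
    alloc_info.commandPool        = pool;
    alloc_info.level              = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
    alloc_info.commandBufferCount = 1;
    VkCommandBuffer cmd;
    vkAllocateCommandBuffers(device, &alloc_info, &cmd);

    VkCommandBufferBeginInfo begin_info = {};
    begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
    begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
    vkBeginCommandBuffer(cmd, &begin_info);

    // A memory barrier between dependent dispatches takes the place of the
    // semaphores that previously ordered separate per-op command buffers.
    VkMemoryBarrier barrier = {};
    barrier.sType         = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
    barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
    barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;

    for (size_t i = 0; i < ops.size(); i++) {
        const ComputeOp & op = ops[i];
        vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, op.pipeline);
        vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, op.layout, 0, 1, &op.descriptors, 0, nullptr);
        vkCmdDispatch(cmd, op.groups_x, op.groups_y, op.groups_z);
        if (i + 1 < ops.size()) {
            vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                                 0, 1, &barrier, 0, nullptr, 0, nullptr);
        }
    }

    vkEndCommandBuffer(cmd);

    // One submission for the whole batch; the host waits on a single fence.
    VkSubmitInfo submit = {};
    submit.sType              = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    submit.commandBufferCount = 1;
    submit.pCommandBuffers    = &cmd;
    vkQueueSubmit(queue, 1, &submit, fence);
    vkWaitForFences(device, 1, &fence, VK_TRUE, UINT64_MAX);
    vkFreeCommandBuffers(device, pool, 1, &cmd);
}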

File diff suppressed because it is too large


@@ -11,7 +11,7 @@ GGML_API void ggml_vk_init(void);
 GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node, struct ggml_cgraph * graph);
 GGML_API void ggml_vk_preallocate_buffers(void);
-GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, struct ggml_cgraph * graph);
+GGML_API void ggml_vk_build_graph(struct ggml_tensor * node);
 GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 #ifdef GGML_VULKAN_CHECK_RESULTS
 void ggml_vk_check_results_0(struct ggml_compute_params * params, struct ggml_tensor * tensor);


@@ -6803,7 +6803,7 @@ static int llama_decode_internal(
     ggml_vk_preallocate_buffers();
     for (int i = 0; i < gf->n_nodes; i++) {
-        ggml_vk_build_graph(gf->nodes[i], gf);
+        ggml_vk_build_graph(gf->nodes[i]);
     }
     // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed