Parse graph early to pre-record command buffers
parent 5ae5d2bd5b · commit 7f89e40e52
5 changed files with 491 additions and 332 deletions
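In short, as the hunks below show: instead of setting up Vulkan work inside each ggml_compute_forward() call, the compute graph is now walked before execution, so device buffers can be sized and allocated up front and each node's command buffer can be recorded ahead of time; a cleanup hook runs after the graph has been computed.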
CMakeLists.txt

```diff
@@ -75,6 +75,7 @@ set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA k
 option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
+option(LLAMA_VULKAN "llama: use Vulkan" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
```
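With the option declared, the backend stays opt-in: it is enabled at configure time with `cmake -DLLAMA_VULKAN=ON`.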
```diff
@@ -355,12 +356,14 @@ if (LLAMA_CLBLAST)
 endif()

 if (LLAMA_VULKAN)
-    find_package(Vulkan COMPONENTS glslc SPIRV-Tools)
+    find_package(Vulkan COMPONENTS shaderc_combined)
+    find_package(glslang)
+    find_package(SPIRV-Tools-opt)
     if (Vulkan_FOUND)
         message(STATUS "Vulkan found")

         add_library(ggml-vulkan STATIC ggml-vulkan.cpp ggml-vulkan.h)
-        target_link_libraries(ggml-vulkan PUBLIC Vulkan::Vulkan SPIRV SPIRV-Tools-opt SPIRV-Tools shaderc_combined)
+        target_link_libraries(ggml-vulkan PUBLIC Vulkan::Vulkan Vulkan::shaderc_combined)

         add_compile_definitions(GGML_USE_VULKAN)

```
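The find_package changes presumably track how the backend compiles its GLSL compute shaders to SPIR-V at runtime: linking the Vulkan::shaderc_combined imported target (provided by CMake's FindVulkan module since CMake 3.24) replaces the previous hand-listed SPIRV/SPIRV-Tools/shaderc libraries.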
ggml-vulkan.cpp (795 changes)
File diff suppressed because it is too large.
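The recording logic itself lives in that suppressed diff. As a rough sketch of what pre-recording a node's work means at the Vulkan API level (illustrative only; `record_node_dispatch` is not from this commit, and the real backend also deals with staging copies, barriers, and descriptor updates):

```c
#include <vulkan/vulkan.h>

/* Record a compute dispatch for one graph node into `cmd` once, up front.
 * Executing the node later is then just a queue submission of this buffer. */
static void record_node_dispatch(VkCommandBuffer cmd, VkPipeline pipeline,
                                 VkPipelineLayout layout, VkDescriptorSet set,
                                 uint32_t gx, uint32_t gy, uint32_t gz) {
    const VkCommandBufferBeginInfo begin_info = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
    };
    vkBeginCommandBuffer(cmd, &begin_info);
    vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
    vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, layout,
                            0 /* firstSet */, 1, &set, 0, NULL);
    vkCmdDispatch(cmd, gx, gy, gz);
    vkEndCommandBuffer(cmd);
}
```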
ggml-vulkan.h

```diff
@@ -8,7 +8,11 @@ extern "C" {

 void ggml_vk_init(void);

+void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node);
+void ggml_vk_preallocate_buffers(void);
+void ggml_vk_build_graph(struct ggml_tensor * node);
 bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+void ggml_vk_graph_cleanup(void);

 void * ggml_vk_host_malloc(size_t size);
 void ggml_vk_host_free(void * ptr);
```
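Read together with the llama.cpp hunk below, the new declarations define a per-graph lifecycle: size, allocate, record, compute, clean up. A minimal caller sketch, assuming a GGML_USE_VULKAN build and an already-constructed graph and plan (`compute_with_vulkan` is a hypothetical helper, not part of this commit):

```c
#include "ggml.h"
#include "ggml-vulkan.h"

/* Hypothetical helper showing the intended order of the new calls. */
static void compute_with_vulkan(struct ggml_cgraph * graph, struct ggml_cplan * plan) {
    // Pass 1: let the backend measure every node so it can allocate
    // its device buffers in one go instead of on demand.
    for (int i = 0; i < graph->n_nodes; i++) {
        ggml_vk_preallocate_buffers_graph(graph->nodes[i]);
    }
    ggml_vk_preallocate_buffers();

    // Pass 2: pre-record a command buffer for each node.
    for (int i = 0; i < graph->n_nodes; i++) {
        ggml_vk_build_graph(graph->nodes[i]);
    }

    // Nodes the backend claims are skipped on the CPU via
    // ggml_vk_compute_forward() inside ggml_compute_forward().
    ggml_graph_compute(graph, plan);

    // Release the per-graph Vulkan state.
    ggml_vk_graph_cleanup();
}
```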
ggml.c (2 changes)

```diff
@@ -14817,7 +14817,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor)
     GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
 #elif defined(GGML_USE_VULKAN)
-    bool skip_cpu = ggml_vk_compute_forward(params, tensor);
+    const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
     if (skip_cpu) {
         return;
     }
```
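The only change here is the added const qualifier; ggml_vk_compute_forward() already returns whether the Vulkan backend handled the node, in which case the CPU implementation is skipped.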
|
15
llama.cpp
15
llama.cpp
|
@ -110,7 +110,22 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
|
|||
plan.work_data = buf.data();
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_VULKAN
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
ggml_vk_preallocate_buffers_graph(graph->nodes[i]);
|
||||
}
|
||||
ggml_vk_preallocate_buffers();
|
||||
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
ggml_vk_build_graph(graph->nodes[i]);
|
||||
}
|
||||
#endif
|
||||
|
||||
ggml_graph_compute(graph, &plan);
|
||||
|
||||
#ifdef GGML_USE_VULKAN
|
||||
ggml_vk_graph_cleanup();
|
||||
#endif
|
||||
}
|
||||
|
||||
//
|
||||
|
|
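Note the structure: the graph is traversed twice before ggml_graph_compute() runs, once so ggml_vk_preallocate_buffers() can make a single bulk allocation and once so every node's command buffer is recorded before execution begins; ggml_vk_graph_cleanup() then tears down the per-graph state afterwards. This moves Vulkan setup cost out of the per-node compute path, which is what the commit title refers to.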