Switch from multiple semaphore-synchronized command buffers per op to a single command buffer for multiple ops, the whole graph if possible

0cc4m 2024-01-16 21:30:14 +01:00
parent 542ae3b44c
commit c3290d29e0
3 changed files with 298 additions and 387 deletions
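For context, this change replaces the earlier pattern of recording one command buffer per graph op and chaining the submissions with semaphores: all dispatches are now recorded into a single command buffer, ordered by pipeline barriers, and submitted once. The following sketch illustrates that single-command-buffer pattern in plain Vulkan; it is not the ggml-vulkan implementation, and the ComputeOp struct, the submit_ops_in_single_cmd_buffer function, and the barrier placement are illustrative assumptions.

// Illustrative sketch only (not the ggml-vulkan code): record several compute
// dispatches into ONE command buffer, synchronize them with pipeline barriers,
// and submit once with a fence -- no per-op command buffers or semaphores.
// ComputeOp and the function name below are assumptions for this example.

#include <vulkan/vulkan.h>
#include <cstdint>
#include <vector>

struct ComputeOp {
    VkPipeline       pipeline;
    VkPipelineLayout layout;
    VkDescriptorSet  descriptors;
    uint32_t         groups_x, groups_y, groups_z;
};

void submit_ops_in_single_cmd_buffer(VkDevice device, VkQueue queue, VkCommandPool pool,
                                     const std::vector<ComputeOp> & ops, VkFence fence) {
    // Allocate a single primary command buffer for the whole batch of ops.
    VkCommandBufferAllocateInfo alloc_info = {};
    alloc_info.sType              = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
    alloc_info.commandPool        = pool;
    alloc_info.level              = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
    alloc_info.commandBufferCount = 1;
    VkCommandBuffer cmd;
    vkAllocateCommandBuffers(device, &alloc_info, &cmd);

    VkCommandBufferBeginInfo begin_info = {};
    begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
    begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
    vkBeginCommandBuffer(cmd, &begin_info);

    // A memory barrier between dependent dispatches takes the place of the
    // semaphores that previously ordered separate per-op command buffers.
    VkMemoryBarrier barrier = {};
    barrier.sType         = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
    barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
    barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;

    for (size_t i = 0; i < ops.size(); i++) {
        const ComputeOp & op = ops[i];
        vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, op.pipeline);
        vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, op.layout, 0, 1, &op.descriptors, 0, nullptr);
        vkCmdDispatch(cmd, op.groups_x, op.groups_y, op.groups_z);
        if (i + 1 < ops.size()) {
            vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                                 0, 1, &barrier, 0, nullptr, 0, nullptr);
        }
    }

    vkEndCommandBuffer(cmd);

    // One submission for the whole batch; the host waits on a single fence.
    VkSubmitInfo submit = {};
    submit.sType              = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    submit.commandBufferCount = 1;
    submit.pCommandBuffers    = &cmd;
    vkQueueSubmit(queue, 1, &submit, fence);
    vkWaitForFences(device, 1, &fence, VK_TRUE, UINT64_MAX);
    vkFreeCommandBuffers(device, pool, 1, &cmd);
}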

File diff suppressed because it is too large


@@ -11,7 +11,7 @@ GGML_API void ggml_vk_init(void);
 GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node, struct ggml_cgraph * graph);
 GGML_API void ggml_vk_preallocate_buffers(void);
-GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, struct ggml_cgraph * graph);
+GGML_API void ggml_vk_build_graph(struct ggml_tensor * node);
 GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 #ifdef GGML_VULKAN_CHECK_RESULTS
 void ggml_vk_check_results_0(struct ggml_compute_params * params, struct ggml_tensor * tensor);


@@ -6803,7 +6803,7 @@ static int llama_decode_internal(
     ggml_vk_preallocate_buffers();
     for (int i = 0; i < gf->n_nodes; i++) {
-        ggml_vk_build_graph(gf->nodes[i], gf);
+        ggml_vk_build_graph(gf->nodes[i]);
     }
     // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed