Properly implement Vulkan backend buffer handling

This commit is contained in:
0cc4m 2024-01-21 10:28:46 +01:00
parent c0f3474ed5
commit 6e6174206f
4 changed files with 283 additions and 453 deletions

File diff suppressed because it is too large. (Load diff)

View file

@ -11,7 +11,7 @@ extern "C" {
GGML_API void ggml_vk_init(void);
GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node, struct ggml_cgraph * graph);
GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node);
GGML_API void ggml_vk_preallocate_buffers(void);
GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, bool last_node);
GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
@ -24,8 +24,6 @@ GGML_API void ggml_vk_graph_cleanup(void);
GGML_API void * ggml_vk_host_malloc(size_t size);
GGML_API void ggml_vk_host_free(void * ptr);
GGML_API void ggml_vk_free_data(const struct ggml_tensor * tensor);
GGML_API void ggml_vk_transform_tensor_temporary(const void * data, struct ggml_tensor * tensor);
GGML_API void ggml_vk_transform_tensor_static(const void * data, struct ggml_tensor * tensor);
GGML_API void ggml_vk_assign_buffer(struct ggml_tensor * tensor);

ggml.c (16 lines changed)
View file

@ -16871,24 +16871,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
}
#ifdef GGML_USE_VULKAN
for (int i = 0; i < cgraph->n_leafs; i++) {
struct ggml_tensor * node = cgraph->leafs[i];
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
ggml_vk_transform_tensor_temporary(node->data, node);
}
}
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_preallocate_buffers_graph(cgraph->nodes[i], cgraph);
ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
}
ggml_vk_preallocate_buffers();
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
}
// Set last tensor to CPU to force copy to CPU
cgraph->nodes[cgraph->n_nodes - 1]->backend = GGML_BACKEND_CPU;
#endif
const int n_threads = cplan->n_threads;
@ -16941,6 +16931,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
}
}
#ifdef GGML_USE_VULKAN
ggml_vk_graph_cleanup();
#endif
// performance stats (graph)
{
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;

View file

@ -9489,7 +9489,6 @@ struct llama_context * llama_new_context_with_model(
}
#elif defined(GGML_USE_VULKAN)
if (model->n_gpu_layers > 0) {
// with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
ggml_backend_t backend = ggml_backend_vk_init();
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);