Properly implement Vulkan backend buffer handling
This commit is contained in:
parent
c0f3474ed5
commit
6e6174206f
4 changed files with 283 additions and 453 deletions
715
ggml-vulkan.cpp
715
ggml-vulkan.cpp
File diff suppressed because it is too large
Load diff
|
@@ -11,7 +11,7 @@ extern "C" {
|
||||||
|
|
||||||
GGML_API void ggml_vk_init(void);
|
GGML_API void ggml_vk_init(void);
|
||||||
|
|
||||||
GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node, struct ggml_cgraph * graph);
|
GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node);
|
||||||
GGML_API void ggml_vk_preallocate_buffers(void);
|
GGML_API void ggml_vk_preallocate_buffers(void);
|
||||||
GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, bool last_node);
|
GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, bool last_node);
|
||||||
GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
||||||
|
@@ -24,8 +24,6 @@ GGML_API void ggml_vk_graph_cleanup(void);
|
||||||
GGML_API void * ggml_vk_host_malloc(size_t size);
|
GGML_API void * ggml_vk_host_malloc(size_t size);
|
||||||
GGML_API void ggml_vk_host_free(void * ptr);
|
GGML_API void ggml_vk_host_free(void * ptr);
|
||||||
|
|
||||||
GGML_API void ggml_vk_free_data(const struct ggml_tensor * tensor);
|
|
||||||
|
|
||||||
GGML_API void ggml_vk_transform_tensor_temporary(const void * data, struct ggml_tensor * tensor);
|
GGML_API void ggml_vk_transform_tensor_temporary(const void * data, struct ggml_tensor * tensor);
|
||||||
GGML_API void ggml_vk_transform_tensor_static(const void * data, struct ggml_tensor * tensor);
|
GGML_API void ggml_vk_transform_tensor_static(const void * data, struct ggml_tensor * tensor);
|
||||||
GGML_API void ggml_vk_assign_buffer(struct ggml_tensor * tensor);
|
GGML_API void ggml_vk_assign_buffer(struct ggml_tensor * tensor);
|
||||||
|
|
16
ggml.c
16
ggml.c
|
@@ -16871,24 +16871,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_USE_VULKAN
|
#ifdef GGML_USE_VULKAN
|
||||||
for (int i = 0; i < cgraph->n_leafs; i++) {
|
|
||||||
struct ggml_tensor * node = cgraph->leafs[i];
|
|
||||||
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
|
|
||||||
ggml_vk_transform_tensor_temporary(node->data, node);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
ggml_vk_preallocate_buffers_graph(cgraph->nodes[i], cgraph);
|
ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
|
||||||
}
|
}
|
||||||
ggml_vk_preallocate_buffers();
|
ggml_vk_preallocate_buffers();
|
||||||
|
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
|
ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set last tensor to CPU to force copy to CPU
|
|
||||||
cgraph->nodes[cgraph->n_nodes - 1]->backend = GGML_BACKEND_CPU;
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const int n_threads = cplan->n_threads;
|
const int n_threads = cplan->n_threads;
|
||||||
|
@@ -16941,6 +16931,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef GGML_USE_VULKAN
|
||||||
|
ggml_vk_graph_cleanup();
|
||||||
|
#endif
|
||||||
|
|
||||||
// performance stats (graph)
|
// performance stats (graph)
|
||||||
{
|
{
|
||||||
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
|
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
|
||||||
|
|
|
@@ -9489,7 +9489,6 @@ struct llama_context * llama_new_context_with_model(
|
||||||
}
|
}
|
||||||
#elif defined(GGML_USE_VULKAN)
|
#elif defined(GGML_USE_VULKAN)
|
||||||
if (model->n_gpu_layers > 0) {
|
if (model->n_gpu_layers > 0) {
|
||||||
// with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
|
|
||||||
ggml_backend_t backend = ggml_backend_vk_init();
|
ggml_backend_t backend = ggml_backend_vk_init();
|
||||||
if (backend == nullptr) {
|
if (backend == nullptr) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue