Vulkan k-quant mmq and ggml-backend offload functionality (#6155)

* Fix Vulkan no kv offload incoherence

* Add k-quant mul mat mat shaders

* Rework working buffer allocation, reduces vram use noticeably

Clean up cpu assist code, replaced with ggml-backend offload function

* Default to all dedicated GPUs

* Add fallback for integrated GPUs if no dedicated GPUs are found

* Add debug info which device is allocating memory

* Fix Intel dequant issue

Fix validation issue

* Fix Vulkan GGML_OP_GET_ROWS implementation

* Clean up merge artifacts

* Remove Vulkan warning
This commit is contained in:
0cc4m 2024-03-29 17:29:21 +01:00 committed by GitHub
parent d48ccf3ad4
commit ba0c7c70ab
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 25616 additions and 3192 deletions

View file

@ -11,17 +11,6 @@ extern "C" {
#define GGML_VK_MAX_DEVICES 16
GGML_API void ggml_vk_instance_init(void);
GGML_API void ggml_vk_init_cpu_assist(void);
GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#ifdef GGML_VULKAN_CHECK_RESULTS
void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#endif
GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
GGML_API void ggml_vk_free_cpu_assist(void);
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);