diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 02689189c..dc3182494 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -117,6 +117,9 @@ vk_pipeline vk_pipeline_dequant_mul_mat_vec_q4_0;
 vk_pipeline vk_pipeline_mul_f32;
 vk_pipeline vk_pipeline_f16_to_f32, vk_pipeline_dequant_q4_0;
 
+void * vk_pinned_workspace;
+size_t vk_pinned_workspace_size;
+
 bool vk_fp16_support = false;
 
 static std::vector<std::pair<void*, size_t>> vk_pinned_memory;
@@ -617,6 +620,9 @@ void ggml_vk_init(void) {
     device_create_info.setPNext(&device_features2);
     vk_device = vk_physical_device.createDevice(device_create_info);
 
+    vk_pinned_workspace = nullptr;
+    vk_pinned_workspace_size = 0;
+
     // Prepare matmul values
     auto warptile_l = { 128, 128, 128, 16, 64, 64, 2, 4, 4 };
     auto warptile_m = { 128, 64, 64, 16, 32, 32, 2, 4, 2 };
@@ -1532,7 +1538,18 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 
     const bool load_x = src0->backend != GGML_BACKEND_GPU;
 
-    ggml_fp16_t * fp16_staging = (ggml_fp16_t *) ggml_vk_host_malloc(sizeof(ggml_fp16_t) * (ne11 * ne10) * (ne02 * ne03));
+    const size_t workspace_size = sizeof(ggml_fp16_t) * (ne11 * ne10) * (ne02 * ne03);
+
+    if (vk_pinned_workspace == nullptr) {
+        vk_pinned_workspace = ggml_vk_host_malloc(workspace_size);
+        vk_pinned_workspace_size = workspace_size;
+    } else if (vk_pinned_workspace_size < workspace_size) {
+        ggml_vk_host_free(vk_pinned_workspace);
+        vk_pinned_workspace = ggml_vk_host_malloc(workspace_size);
+        vk_pinned_workspace_size = workspace_size;
+    }
+
+    ggml_fp16_t * fp16_staging = (ggml_fp16_t *) vk_pinned_workspace;
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -1618,8 +1635,6 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     ggml_vk_queue_cleanup(vk_transfer_queues[1]);
     ggml_vk_queue_cleanup(vk_compute_queue);
 
-    ggml_vk_host_free(fp16_staging);
-
     if (src0->backend != GGML_BACKEND_GPU) {
         ggml_vk_pool_free(d_X);
     }
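
The hunks above replace the per-call pinned staging allocation in `ggml_vk_mul_mat_f16` with a cached, grow-only workspace that is only (re)allocated when a larger buffer is needed. The sketch below is a minimal, self-contained illustration of that reuse pattern, not the patch itself: `get_pinned_workspace` is a hypothetical helper, and plain `malloc`/`free` stand in for the backend's `ggml_vk_host_malloc`/`ggml_vk_host_free` pinned allocators so the example compiles on its own.

```cpp
// Minimal sketch of the grow-only pinned-workspace cache (assumption: std::malloc /
// std::free stand in for ggml_vk_host_malloc / ggml_vk_host_free).
#include <cstdio>
#include <cstdlib>

static void * vk_pinned_workspace      = nullptr; // cached staging buffer
static size_t vk_pinned_workspace_size = 0;       // its current capacity in bytes

// Hypothetical helper: return a workspace of at least `size` bytes, reallocating
// only when the cached buffer is too small. Smaller requests reuse the existing
// allocation, so repeated mat-mul calls avoid re-pinning host memory every time.
static void * get_pinned_workspace(size_t size) {
    if (vk_pinned_workspace == nullptr || vk_pinned_workspace_size < size) {
        std::free(vk_pinned_workspace); // no-op on the first call (nullptr)
        vk_pinned_workspace      = std::malloc(size);
        vk_pinned_workspace_size = size;
    }
    return vk_pinned_workspace;
}

int main() {
    void * a = get_pinned_workspace(1024); // first call allocates
    void * b = get_pinned_workspace(512);  // smaller request reuses the buffer
    std::printf("reused existing workspace: %s\n", a == b ? "yes" : "no");
    get_pinned_workspace(4096);            // larger request grows the workspace
    std::free(vk_pinned_workspace);        // cleanup for the sketch only
}
```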