Reuse pinned allocation for f16 conversion

Author: 0cc4m
Date:   2023-07-22 18:48:15 +02:00
parent  f2d4ca34bf
commit  67843a3812


@@ -117,6 +117,9 @@ vk_pipeline vk_pipeline_dequant_mul_mat_vec_q4_0;
 vk_pipeline vk_pipeline_mul_f32;
 vk_pipeline vk_pipeline_f16_to_f32, vk_pipeline_dequant_q4_0;
+
+void * vk_pinned_workspace;
+size_t vk_pinned_workspace_size;
 
 bool vk_fp16_support = false;
 static std::vector<std::tuple<void*, size_t, vk_buffer>> vk_pinned_memory;
@@ -617,6 +620,9 @@ void ggml_vk_init(void) {
     device_create_info.setPNext(&device_features2);
     vk_device = vk_physical_device.createDevice(device_create_info);
 
+    vk_pinned_workspace = nullptr;
+    vk_pinned_workspace_size = 0;
+
     // Prepare matmul values
     auto warptile_l = { 128, 128, 128, 16, 64, 64, 2, 4, 4 };
     auto warptile_m = { 128, 64, 64, 16, 32, 32, 2, 4, 2 };
@@ -1532,7 +1538,18 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     const bool load_x = src0->backend != GGML_BACKEND_GPU;
 
-    ggml_fp16_t * fp16_staging = (ggml_fp16_t *) ggml_vk_host_malloc(sizeof(ggml_fp16_t) * (ne11 * ne10) * (ne02 * ne03));
+    const size_t workspace_size = sizeof(ggml_fp16_t) * (ne11 * ne10) * (ne02 * ne03);
+
+    if (vk_pinned_workspace == nullptr) {
+        vk_pinned_workspace = ggml_vk_host_malloc(workspace_size);
+        vk_pinned_workspace_size = workspace_size;
+    } else if (vk_pinned_workspace_size < workspace_size) {
+        ggml_vk_host_free(vk_pinned_workspace);
+        vk_pinned_workspace = ggml_vk_host_malloc(workspace_size);
+        vk_pinned_workspace_size = workspace_size;
+    }
+
+    ggml_fp16_t * fp16_staging = (ggml_fp16_t *) vk_pinned_workspace;
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -1618,8 +1635,6 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     ggml_vk_queue_cleanup(vk_transfer_queues[1]);
     ggml_vk_queue_cleanup(vk_compute_queue);
 
-    ggml_vk_host_free(fp16_staging);
-
     if (src0->backend != GGML_BACKEND_GPU) {
        ggml_vk_pool_free(d_X);
     }
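
The change follows a simple reuse-or-grow pattern: allocate the pinned staging workspace on first use, keep it across calls, and only free and reallocate it when a larger f16 conversion buffer is requested. Below is a minimal standalone sketch of that pattern, not the backend's actual code: std::malloc/std::free stand in for ggml_vk_host_malloc/ggml_vk_host_free, and the names workspace/get_workspace are hypothetical.

#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Sketch only: a global workspace pointer and its current capacity,
// mirroring vk_pinned_workspace / vk_pinned_workspace_size in the diff.
static void *      workspace      = nullptr;
static std::size_t workspace_size = 0;

// Return a buffer of at least `size` bytes, reusing the previous
// allocation when it is already large enough, growing it otherwise.
static void * get_workspace(std::size_t size) {
    if (workspace == nullptr || workspace_size < size) {
        std::free(workspace);               // stand-in for ggml_vk_host_free; free(nullptr) is a no-op
        workspace      = std::malloc(size); // stand-in for ggml_vk_host_malloc
        workspace_size = (workspace != nullptr) ? size : 0;
    }
    return workspace;
}

int main() {
    void * a = get_workspace(1024); // first call allocates
    void * b = get_workspace(512);  // smaller request reuses the same buffer
    std::printf("reused: %s\n", a == b ? "yes" : "no");
    get_workspace(4096);            // larger request frees and reallocates
    std::free(workspace);           // cleanup for this standalone example only
    return 0;
}

Because the buffer now outlives each ggml_vk_mul_mat_f16 call, the per-call ggml_vk_host_free(fp16_staging) in the cleanup path is removed; the allocation is only released when a later call needs a larger workspace.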