Reuse pinned allocation for f16 conversion
commit 67843a3812 (parent f2d4ca34bf)
1 changed file with 18 additions and 3 deletions
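The diff below replaces the per-call pinned staging buffer in ggml_vk_mul_mat_f16 with a global workspace (vk_pinned_workspace / vk_pinned_workspace_size) that is allocated on first use, grown when a larger conversion is needed, and otherwise reused across calls, so a fresh pinned allocation no longer happens on every f16 matrix multiplication.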
@@ -117,6 +117,9 @@ vk_pipeline vk_pipeline_dequant_mul_mat_vec_q4_0;
 vk_pipeline vk_pipeline_mul_f32;
 vk_pipeline vk_pipeline_f16_to_f32, vk_pipeline_dequant_q4_0;
 
+void * vk_pinned_workspace;
+size_t vk_pinned_workspace_size;
+
 bool vk_fp16_support = false;
 
 static std::vector<std::tuple<void*, size_t, vk_buffer>> vk_pinned_memory;
@@ -617,6 +620,9 @@ void ggml_vk_init(void) {
     device_create_info.setPNext(&device_features2);
     vk_device = vk_physical_device.createDevice(device_create_info);
 
+    vk_pinned_workspace = nullptr;
+    vk_pinned_workspace_size = 0;
+
     // Prepare matmul values
     auto warptile_l = { 128, 128, 128, 16, 64, 64, 2, 4, 4 };
     auto warptile_m = { 128, 64, 64, 16, 32, 32, 2, 4, 2 };
@@ -1532,7 +1538,18 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 
     const bool load_x = src0->backend != GGML_BACKEND_GPU;
 
-    ggml_fp16_t * fp16_staging = (ggml_fp16_t *) ggml_vk_host_malloc(sizeof(ggml_fp16_t) * (ne11 * ne10) * (ne02 * ne03));
+    const size_t workspace_size = sizeof(ggml_fp16_t) * (ne11 * ne10) * (ne02 * ne03);
+
+    if (vk_pinned_workspace == nullptr) {
+        vk_pinned_workspace = ggml_vk_host_malloc(workspace_size);
+        vk_pinned_workspace_size = workspace_size;
+    } else if (vk_pinned_workspace_size < workspace_size) {
+        ggml_vk_host_free(vk_pinned_workspace);
+        vk_pinned_workspace = ggml_vk_host_malloc(workspace_size);
+        vk_pinned_workspace_size = workspace_size;
+    }
+
+    ggml_fp16_t * fp16_staging = (ggml_fp16_t *) vk_pinned_workspace;
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -1618,8 +1635,6 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     ggml_vk_queue_cleanup(vk_transfer_queues[1]);
     ggml_vk_queue_cleanup(vk_compute_queue);
 
-    ggml_vk_host_free(fp16_staging);
-
     if (src0->backend != GGML_BACKEND_GPU) {
        ggml_vk_pool_free(d_X);
    }
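For reference, here is a minimal standalone sketch of the reuse-or-grow pattern the hunks above introduce. std::malloc and std::free stand in for ggml_vk_host_malloc and ggml_vk_host_free (which allocate and release pinned host memory in the real backend), and get_workspace is an illustrative helper that does not appear in the commit:

// Sketch of the reuse-or-grow workspace pattern; std::malloc/std::free
// stand in for the pinned allocators ggml_vk_host_malloc/ggml_vk_host_free.
#include <cstdio>
#include <cstdlib>

static void * workspace      = nullptr;  // mirrors vk_pinned_workspace
static size_t workspace_size = 0;        // mirrors vk_pinned_workspace_size

// Illustrative helper (not in the commit): return a buffer of at least
// `size` bytes, reusing the previous allocation when it is big enough.
static void * get_workspace(size_t size) {
    if (workspace != nullptr && workspace_size >= size) {
        return workspace;                // fast path: reuse the old buffer
    }
    std::free(workspace);                // no-op on the first call
    workspace      = std::malloc(size);  // first allocation, or grow
    workspace_size = size;
    return workspace;
}

int main() {
    void * a = get_workspace(1024);      // first call: allocates
    void * b = get_workspace(512);       // smaller request: buffer reused
    std::printf("reused: %s\n", a == b ? "yes" : "no");
    get_workspace(4096);                 // larger request: reallocated
    std::free(workspace);                // the commit keeps it for the process lifetime
    return 0;
}

Note the trade-off the commit makes: the workspace only ever grows and is kept alive for the rest of the process, which is why the last hunk also drops the per-call ggml_vk_host_free(fp16_staging); fp16_staging now points into the long-lived workspace rather than a temporary allocation.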