Fix Vulkan on Intel ARC
Optimize matmul for Intel ARC Add Vulkan dequant test
This commit is contained in:
parent
191221178f
commit
4e9091f11e
3 changed files with 1118 additions and 10224 deletions
10922
ggml-vulkan-shaders.hpp
10922
ggml-vulkan-shaders.hpp
File diff suppressed because it is too large
Load diff
206
ggml-vulkan.cpp
206
ggml-vulkan.cpp
|
@ -849,36 +849,6 @@ static void ggml_vk_load_shaders() {
|
||||||
vk_pipeline_matmul_f16_f32_aligned_l = ggml_vk_create_pipeline("matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_l_len, matmul_f16_f32_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
|
vk_pipeline_matmul_f16_f32_aligned_l = ggml_vk_create_pipeline("matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_l_len, matmul_f16_f32_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
|
||||||
vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline("matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_m_len, matmul_f16_f32_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
|
vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline("matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_m_len, matmul_f16_f32_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
|
||||||
vk_pipeline_matmul_f16_f32_aligned_s = ggml_vk_create_pipeline("matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_s_len, matmul_f16_f32_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
|
vk_pipeline_matmul_f16_f32_aligned_s = ggml_vk_create_pipeline("matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_s_len, matmul_f16_f32_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
|
||||||
|
|
||||||
// Build dequant shaders
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_F32] = ggml_vk_create_pipeline("f32_to_f16", f32_to_f16_len, f32_to_f16_data, "main", 2, 4 * sizeof(int), {64, 1, 1}, {}, 1);
|
|
||||||
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_F16] = ggml_vk_create_pipeline("dequant_f16", dequant_f16_len, dequant_f16_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q2_K] = ggml_vk_create_pipeline("dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q3_K] = ggml_vk_create_pipeline("dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q4_K] = ggml_vk_create_pipeline("dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q5_K] = ggml_vk_create_pipeline("dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q6_K] = ggml_vk_create_pipeline("dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1);
|
|
||||||
|
|
||||||
// get_rows
|
|
||||||
vk_pipeline_get_rows[GGML_TYPE_F16] = ggml_vk_create_pipeline("get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
|
|
||||||
vk_pipeline_get_rows_f32[GGML_TYPE_F16] = ggml_vk_create_pipeline("get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows_f32[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows_f32[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows_f32[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows_f32[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows_f32[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
} else {
|
} else {
|
||||||
vk_pipeline_matmul_f32_l = ggml_vk_create_pipeline("matmul_f32_l", matmul_f32_l_fp32_len, matmul_f32_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
|
vk_pipeline_matmul_f32_l = ggml_vk_create_pipeline("matmul_f32_l", matmul_f32_l_fp32_len, matmul_f32_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
|
||||||
vk_pipeline_matmul_f32_m = ggml_vk_create_pipeline("matmul_f32_m", matmul_f32_m_fp32_len, matmul_f32_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
|
vk_pipeline_matmul_f32_m = ggml_vk_create_pipeline("matmul_f32_m", matmul_f32_m_fp32_len, matmul_f32_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
|
||||||
|
@ -901,36 +871,6 @@ static void ggml_vk_load_shaders() {
|
||||||
vk_pipeline_matmul_f16_f32_aligned_l = ggml_vk_create_pipeline("matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_l_fp32_len, matmul_f16_f32_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
|
vk_pipeline_matmul_f16_f32_aligned_l = ggml_vk_create_pipeline("matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_l_fp32_len, matmul_f16_f32_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
|
||||||
vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline("matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_m_fp32_len, matmul_f16_f32_aligned_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
|
vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline("matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_m_fp32_len, matmul_f16_f32_aligned_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
|
||||||
vk_pipeline_matmul_f16_f32_aligned_s = ggml_vk_create_pipeline("matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_s_fp32_len, matmul_f16_f32_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
|
vk_pipeline_matmul_f16_f32_aligned_s = ggml_vk_create_pipeline("matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_s_fp32_len, matmul_f16_f32_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
|
||||||
|
|
||||||
// Build dequant shaders
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_F32] = ggml_vk_create_pipeline("f32_to_f16", f32_to_f16_fp32_len, f32_to_f16_fp32_data, "main", 2, 4 * sizeof(int), {64, 1, 1}, {}, 1);
|
|
||||||
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_F16] = ggml_vk_create_pipeline("dequant_f16", dequant_f16_fp32_len, dequant_f16_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("dequant_q4_0", dequant_q4_0_fp32_len, dequant_q4_0_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("dequant_q4_1", dequant_q4_1_fp32_len, dequant_q4_1_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("dequant_q5_0", dequant_q5_0_fp32_len, dequant_q5_0_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("dequant_q5_1", dequant_q5_1_fp32_len, dequant_q5_1_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("dequant_q8_0", dequant_q8_0_fp32_len, dequant_q8_0_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q2_K] = ggml_vk_create_pipeline("dequant_q2_K", dequant_q2_K_fp32_len, dequant_q2_K_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q3_K] = ggml_vk_create_pipeline("dequant_q3_K", dequant_q3_K_fp32_len, dequant_q3_K_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q4_K] = ggml_vk_create_pipeline("dequant_q4_K", dequant_q4_K_fp32_len, dequant_q4_K_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q5_K] = ggml_vk_create_pipeline("dequant_q5_K", dequant_q5_K_fp32_len, dequant_q5_K_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_dequant[GGML_TYPE_Q6_K] = ggml_vk_create_pipeline("dequant_q6_K", dequant_q6_K_fp32_len, dequant_q6_K_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
|
||||||
|
|
||||||
// get_rows
|
|
||||||
vk_pipeline_get_rows[GGML_TYPE_F16] = ggml_vk_create_pipeline("get_rows_f16", get_rows_f16_fp32_len, get_rows_f16_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("get_rows_q4_0", get_rows_q4_0_fp32_len, get_rows_q4_0_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("get_rows_q4_1", get_rows_q4_1_fp32_len, get_rows_q4_1_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("get_rows_q5_0", get_rows_q5_0_fp32_len, get_rows_q5_0_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("get_rows_q5_1", get_rows_q5_1_fp32_len, get_rows_q5_1_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("get_rows_q8_0", get_rows_q8_0_fp32_len, get_rows_q8_0_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
|
|
||||||
vk_pipeline_get_rows_f32[GGML_TYPE_F16] = ggml_vk_create_pipeline("get_rows_f16_f32", get_rows_f16_f32_fp32_len, get_rows_f16_f32_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows_f32[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("get_rows_q4_0_f32", get_rows_q4_0_f32_fp32_len, get_rows_q4_0_f32_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows_f32[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("get_rows_q4_1_f32", get_rows_q4_1_f32_fp32_len, get_rows_q4_1_f32_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows_f32[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("get_rows_q5_0_f32", get_rows_q5_0_f32_fp32_len, get_rows_q5_0_f32_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows_f32[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("get_rows_q5_1_f32", get_rows_q5_1_f32_fp32_len, get_rows_q5_1_f32_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
vk_pipeline_get_rows_f32[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("get_rows_q8_0_f32", get_rows_q8_0_f32_fp32_len, get_rows_q8_0_f32_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16] = ggml_vk_create_pipeline("mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1);
|
vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16] = ggml_vk_create_pipeline("mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1);
|
||||||
|
@ -945,6 +885,36 @@ static void ggml_vk_load_shaders() {
|
||||||
vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_K] = ggml_vk_create_pipeline("mul_mat_vec_q5_K_f32", mul_mat_vec_q5_K_f32_len, mul_mat_vec_q5_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1);
|
vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_K] = ggml_vk_create_pipeline("mul_mat_vec_q5_K_f32", mul_mat_vec_q5_K_f32_len, mul_mat_vec_q5_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1);
|
||||||
vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q6_K] = ggml_vk_create_pipeline("mul_mat_vec_q6_K_f32", mul_mat_vec_q6_K_f32_len, mul_mat_vec_q6_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1);
|
vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q6_K] = ggml_vk_create_pipeline("mul_mat_vec_q6_K_f32", mul_mat_vec_q6_K_f32_len, mul_mat_vec_q6_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1);
|
||||||
|
|
||||||
|
// dequant shaders
|
||||||
|
vk_pipeline_dequant[GGML_TYPE_F32] = ggml_vk_create_pipeline("f32_to_f16", f32_to_f16_len, f32_to_f16_data, "main", 2, 4 * sizeof(int), {64, 1, 1}, {}, 1);
|
||||||
|
|
||||||
|
vk_pipeline_dequant[GGML_TYPE_F16] = ggml_vk_create_pipeline("dequant_f16", dequant_f16_len, dequant_f16_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_dequant[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_dequant[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_dequant[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_dequant[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_dequant[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_dequant[GGML_TYPE_Q2_K] = ggml_vk_create_pipeline("dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_dequant[GGML_TYPE_Q3_K] = ggml_vk_create_pipeline("dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_dequant[GGML_TYPE_Q4_K] = ggml_vk_create_pipeline("dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_dequant[GGML_TYPE_Q5_K] = ggml_vk_create_pipeline("dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_dequant[GGML_TYPE_Q6_K] = ggml_vk_create_pipeline("dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1);
|
||||||
|
|
||||||
|
// get_rows
|
||||||
|
vk_pipeline_get_rows[GGML_TYPE_F16] = ggml_vk_create_pipeline("get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_get_rows[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_get_rows[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_get_rows[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_get_rows[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_get_rows[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
|
||||||
|
vk_pipeline_get_rows_f32[GGML_TYPE_F16] = ggml_vk_create_pipeline("get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_get_rows_f32[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_get_rows_f32[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_get_rows_f32[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_get_rows_f32[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
vk_pipeline_get_rows_f32[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
|
||||||
vk_pipeline_matmul_split_k_reduce = ggml_vk_create_pipeline("split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
|
vk_pipeline_matmul_split_k_reduce = ggml_vk_create_pipeline("split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
|
||||||
|
|
||||||
vk_pipeline_mul_mat_vec_p021_f16_f32 = ggml_vk_create_pipeline("mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
|
vk_pipeline_mul_mat_vec_p021_f16_f32 = ggml_vk_create_pipeline("mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
|
||||||
|
@ -1875,7 +1845,7 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(bool bit16_x, bool bit16_y, in
|
||||||
std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
|
std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
|
||||||
#endif
|
#endif
|
||||||
if (bit16_x && bit16_y) {
|
if (bit16_x && bit16_y) {
|
||||||
if (m <= 32 || n <= 32) {
|
if (vk_device.vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
|
||||||
#ifdef VK_DEBUG
|
#ifdef VK_DEBUG
|
||||||
std::cerr << " S" << std::endl;
|
std::cerr << " S" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
@ -1893,7 +1863,7 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(bool bit16_x, bool bit16_y, in
|
||||||
return aligned ? &vk_pipeline_matmul_f16_aligned_l : &vk_pipeline_matmul_f16_l;
|
return aligned ? &vk_pipeline_matmul_f16_aligned_l : &vk_pipeline_matmul_f16_l;
|
||||||
}
|
}
|
||||||
if (bit16_x && !bit16_y) {
|
if (bit16_x && !bit16_y) {
|
||||||
if (m <= 32 || n <= 32) {
|
if (vk_device.vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
|
||||||
#ifdef VK_DEBUG
|
#ifdef VK_DEBUG
|
||||||
std::cerr << " S" << std::endl;
|
std::cerr << " S" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
@ -1914,7 +1884,7 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(bool bit16_x, bool bit16_y, in
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (m <= 32 || n <= 32) {
|
if (vk_device.vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
|
||||||
#ifdef VK_DEBUG
|
#ifdef VK_DEBUG
|
||||||
std::cerr << " S" << std::endl;
|
std::cerr << " S" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
@ -1936,8 +1906,8 @@ static void ggml_vk_matmul(vk_context * ctx, vk_pipeline& pipeline, vk_subbuffer
|
||||||
#ifdef VK_DEBUG
|
#ifdef VK_DEBUG
|
||||||
std::cerr << "ggml_vk_matmul(a: (" << a.buffer.buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer.buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer.buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer.buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
|
std::cerr << "ggml_vk_matmul(a: (" << a.buffer.buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer.buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer.buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer.buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
ggml_vk_sync_buffers(ctx);
|
||||||
if (split_k == 1) {
|
if (split_k == 1) {
|
||||||
ggml_vk_sync_buffers(ctx);
|
|
||||||
const std::array<uint32_t, 14> pc = { m, n, k, stride_a, stride_b, stride_d, k, ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d };
|
const std::array<uint32_t, 14> pc = { m, n, k, stride_a, stride_b, stride_d, k, ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d };
|
||||||
ggml_vk_dispatch_pipeline(ctx, pipeline, { a, b, d }, pc.size() * sizeof(uint32_t), pc.data(), { m, n, batch });
|
ggml_vk_dispatch_pipeline(ctx, pipeline, { a, b, d }, pc.size() * sizeof(uint32_t), pc.data(), { m, n, batch });
|
||||||
return;
|
return;
|
||||||
|
@ -1945,10 +1915,6 @@ static void ggml_vk_matmul(vk_context * ctx, vk_pipeline& pipeline, vk_subbuffer
|
||||||
|
|
||||||
GGML_ASSERT(batch_stride_d == m * n);
|
GGML_ASSERT(batch_stride_d == m * n);
|
||||||
|
|
||||||
// Synchronize the two submissions
|
|
||||||
ggml_vk_sync_buffers(ctx);
|
|
||||||
ctx->s->buffer.fillBuffer(split_k_buffer.buffer.buffer, 0, split_k_buffer.size, 0);
|
|
||||||
ggml_vk_sync_buffers(ctx);
|
|
||||||
const std::array<uint32_t, 14> pc1 = { m, n, k, stride_a, stride_b, stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d };
|
const std::array<uint32_t, 14> pc1 = { m, n, k, stride_a, stride_b, stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d };
|
||||||
// Make sure enough workgroups get assigned for split k to work
|
// Make sure enough workgroups get assigned for split k to work
|
||||||
ggml_vk_dispatch_pipeline(ctx, pipeline, { a, b, split_k_buffer }, pc1.size() * sizeof(uint32_t), pc1.data(), { (CEIL_DIV(m, pipeline.wg_denoms[0]) * pipeline.wg_denoms[0]) * split_k, n, batch });
|
ggml_vk_dispatch_pipeline(ctx, pipeline, { a, b, split_k_buffer }, pc1.size() * sizeof(uint32_t), pc1.data(), { (CEIL_DIV(m, pipeline.wg_denoms[0]) * pipeline.wg_denoms[0]) * split_k, n, batch });
|
||||||
|
@ -3600,6 +3566,99 @@ static void ggml_vk_test_transfer(size_t ne, bool pinned) {
|
||||||
free(y);
|
free(y);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void ggml_vk_test_dequant(size_t ne, ggml_type quant) {
|
||||||
|
#ifdef VK_DEBUG
|
||||||
|
std::cerr << "ggml_vk_test_dequant(" << ne << ")" << std::endl;
|
||||||
|
#endif
|
||||||
|
const size_t x_sz = sizeof(float) * ne;
|
||||||
|
const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne;
|
||||||
|
const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
|
||||||
|
float * x = (float *) malloc(x_sz);
|
||||||
|
void * qx = malloc(qx_sz);
|
||||||
|
vk_buffer qx_buf = ggml_vk_create_buffer_check(qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||||
|
vk_buffer x_buf = ggml_vk_create_buffer_check(x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||||
|
ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16);
|
||||||
|
|
||||||
|
for (size_t i = 0; i < ne; i++) {
|
||||||
|
x[i] = rand() / (float)RAND_MAX;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<int64_t> hist_cur(1 << 4, 0);
|
||||||
|
|
||||||
|
vk_pipeline& p = vk_pipeline_dequant[quant];
|
||||||
|
|
||||||
|
switch(quant) {
|
||||||
|
case GGML_TYPE_Q4_0:
|
||||||
|
ggml_quantize_q4_0(x, qx, ne, ne, hist_cur.data());
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_Q4_1:
|
||||||
|
ggml_quantize_q4_1(x, qx, ne, ne, hist_cur.data());
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_Q5_0:
|
||||||
|
ggml_quantize_q5_0(x, qx, ne, ne, hist_cur.data());
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_Q5_1:
|
||||||
|
ggml_quantize_q4_1(x, qx, ne, ne, hist_cur.data());
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_Q8_0:
|
||||||
|
ggml_quantize_q8_0(x, qx, ne, ne, hist_cur.data());
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
|
ggml_quantize_q2_K(x, qx, ne, ne, hist_cur.data());
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_Q3_K:
|
||||||
|
ggml_quantize_q3_K(x, qx, ne, ne, hist_cur.data());
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_Q4_K:
|
||||||
|
ggml_quantize_q4_K(x, qx, ne, ne, hist_cur.data());
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
|
ggml_quantize_q5_K(x, qx, ne, ne, hist_cur.data());
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_Q6_K:
|
||||||
|
ggml_quantize_q6_K(x, qx, ne, ne, hist_cur.data());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_vk_pipeline_allocate_descriptor_sets(p, 1);
|
||||||
|
|
||||||
|
ggml_vk_buffer_write(&qx_buf, 0, qx, qx_sz);
|
||||||
|
|
||||||
|
vk_context * ctx = ggml_vk_create_context(vk_device.compute_queue);
|
||||||
|
ggml_vk_ctx_begin(ctx);
|
||||||
|
const std::vector<int> pc = { 1, (int)ne, (int)ne, (int)ne };
|
||||||
|
ggml_vk_sync_buffers(ctx);
|
||||||
|
ggml_vk_dispatch_pipeline(ctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
|
||||||
|
ggml_vk_ctx_end(ctx);
|
||||||
|
|
||||||
|
auto begin = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
ggml_vk_submit(ctx, vk_fence);
|
||||||
|
VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
|
||||||
|
vk_device.device.resetFences({ vk_fence });
|
||||||
|
|
||||||
|
auto end = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
double ms_dequant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|
||||||
|
ggml_vk_buffer_read(&x_buf, 0, x_chk, x_sz_f16);
|
||||||
|
|
||||||
|
double avg_err = 0.0;
|
||||||
|
for (size_t i = 0; i < ne; i++) {
|
||||||
|
avg_err += std::fabs(x[i] - ggml_fp16_to_fp32(x_chk[i]));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cerr << "TEST DEQUANT " << ggml_type_name(quant) << " time=" << ms_dequant << "ms avg_err=" << avg_err / ne << std::endl;
|
||||||
|
|
||||||
|
ggml_vk_destroy_buffer(x_buf);
|
||||||
|
ggml_vk_destroy_buffer(qx_buf);
|
||||||
|
|
||||||
|
free(x);
|
||||||
|
free(qx);
|
||||||
|
free(x_chk);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
|
static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
|
||||||
|
@ -3755,6 +3814,17 @@ void ggml_vk_preallocate_buffers() {
|
||||||
ggml_vk_test_transfer(8192 * 1000, false);
|
ggml_vk_test_transfer(8192 * 1000, false);
|
||||||
ggml_vk_test_transfer(8192 * 1000, true);
|
ggml_vk_test_transfer(8192 * 1000, true);
|
||||||
|
|
||||||
|
ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q4_0);
|
||||||
|
ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q4_1);
|
||||||
|
ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q5_0);
|
||||||
|
ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q5_1);
|
||||||
|
ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q8_0);
|
||||||
|
ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q2_K);
|
||||||
|
ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q3_K);
|
||||||
|
ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q4_K);
|
||||||
|
ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q5_K);
|
||||||
|
ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q6_K);
|
||||||
|
|
||||||
const std::vector<size_t> vals {
|
const std::vector<size_t> vals {
|
||||||
8, 8, 8,
|
8, 8, 8,
|
||||||
100, 46, 576,
|
100, 46, 576,
|
||||||
|
|
|
@ -157,19 +157,10 @@ struct block_q6_K
|
||||||
|
|
||||||
# Dequant functions
|
# Dequant functions
|
||||||
shader_f16_dequant_func = """
|
shader_f16_dequant_func = """
|
||||||
#define DEQUANT_FUNC f16vec2 v = f16vec2(data_a[ib + 0], data_a[ib + 1]);
|
|
||||||
"""
|
|
||||||
shader_f16_dequant_func_compat = """
|
|
||||||
#define DEQUANT_FUNC vec2 v = vec2(data_a[ib + 0], data_a[ib + 1]);
|
#define DEQUANT_FUNC vec2 v = vec2(data_a[ib + 0], data_a[ib + 1]);
|
||||||
"""
|
"""
|
||||||
|
|
||||||
shader_q4_0_dequant_func = """
|
shader_q4_0_dequant_func = """
|
||||||
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
|
|
||||||
const uint8_t vui = data_a[ib].qs[iqs]; \
|
|
||||||
f16vec2 v = f16vec2(vui & 0xF, vui >> 4); \
|
|
||||||
v = (v - 8.0hf)*d;
|
|
||||||
"""
|
|
||||||
shader_q4_0_dequant_func_compat = """
|
|
||||||
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
||||||
const uint vui = uint(data_a[ib].qs[iqs]); \
|
const uint vui = uint(data_a[ib].qs[iqs]); \
|
||||||
vec2 v = vec2(vui & 0xF, vui >> 4); \
|
vec2 v = vec2(vui & 0xF, vui >> 4); \
|
||||||
|
@ -177,13 +168,6 @@ v = (v - 8.0f)*d;
|
||||||
"""
|
"""
|
||||||
|
|
||||||
shader_q4_1_dequant_func = """
|
shader_q4_1_dequant_func = """
|
||||||
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
|
|
||||||
const float16_t m = data_a[ib].m; \
|
|
||||||
const uint8_t vui = data_a[ib].qs[iqs]; \
|
|
||||||
f16vec2 v = f16vec2(vui & 0xF, vui >> 4); \
|
|
||||||
v = v*d + m;
|
|
||||||
"""
|
|
||||||
shader_q4_1_dequant_func_compat = """
|
|
||||||
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
||||||
const float m = float(data_a[ib].m); \
|
const float m = float(data_a[ib].m); \
|
||||||
const uint vui = uint(data_a[ib].qs[iqs]); \
|
const uint vui = uint(data_a[ib].qs[iqs]); \
|
||||||
|
@ -192,14 +176,6 @@ v = v*d + m;
|
||||||
"""
|
"""
|
||||||
|
|
||||||
shader_q5_0_dequant_func = """
|
shader_q5_0_dequant_func = """
|
||||||
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
|
|
||||||
const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; \
|
|
||||||
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); \
|
|
||||||
const uint8_t vui = data_a[ib].qs[iqs]; \
|
|
||||||
f16vec2 v = f16vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y); \
|
|
||||||
v = (v - 16.0hf) * d;
|
|
||||||
"""
|
|
||||||
shader_q5_0_dequant_func_compat = """
|
|
||||||
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
||||||
const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; \
|
const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; \
|
||||||
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); \
|
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); \
|
||||||
|
@ -209,14 +185,6 @@ v = (v - 16.0f) * d;
|
||||||
"""
|
"""
|
||||||
|
|
||||||
shader_q5_1_dequant_func = """
|
shader_q5_1_dequant_func = """
|
||||||
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
|
|
||||||
const float16_t m = data_a[ib].m; \
|
|
||||||
const ivec2 qh = ivec2(((data_a[ib].qh >> iqs) << 4) & 0x10, (data_a[ib].qh >> (iqs + 12)) & 0x10); \
|
|
||||||
const uint8_t vui = data_a[ib].qs[iqs]; \
|
|
||||||
f16vec2 v = f16vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y); \
|
|
||||||
v = v*d + m;
|
|
||||||
"""
|
|
||||||
shader_q5_1_dequant_func_compat = """
|
|
||||||
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
||||||
const float m = float(data_a[ib].m); \
|
const float m = float(data_a[ib].m); \
|
||||||
const ivec2 qh = ivec2(((data_a[ib].qh >> iqs) << 4) & 0x10, (data_a[ib].qh >> (iqs + 12)) & 0x10); \
|
const ivec2 qh = ivec2(((data_a[ib].qh >> iqs) << 4) & 0x10, (data_a[ib].qh >> (iqs + 12)) & 0x10); \
|
||||||
|
@ -226,11 +194,6 @@ v = v*d + m;
|
||||||
"""
|
"""
|
||||||
|
|
||||||
shader_q8_0_dequant_func = """
|
shader_q8_0_dequant_func = """
|
||||||
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
|
|
||||||
f16vec2 v = f16vec2(data_a[ib].qs[iqs], data_a[ib].qs[iqs + 1]); \
|
|
||||||
v = v * d;
|
|
||||||
"""
|
|
||||||
shader_q8_0_dequant_func_compat = """
|
|
||||||
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
||||||
vec2 v = vec2(int(data_a[ib].qs[iqs]), int(data_a[ib].qs[iqs + 1])); \
|
vec2 v = vec2(int(data_a[ib].qs[iqs]), int(data_a[ib].qs[iqs + 1])); \
|
||||||
v = v * d;
|
v = v * d;
|
||||||
|
@ -2110,7 +2073,7 @@ lock = asyncio.Lock()
|
||||||
shader_fnames = []
|
shader_fnames = []
|
||||||
|
|
||||||
|
|
||||||
async def string_to_spv(name, code, defines, fp16):
|
async def string_to_spv(name, code, defines, fp16=True):
|
||||||
f = NamedTemporaryFile(mode="w", delete=False)
|
f = NamedTemporaryFile(mode="w", delete=False)
|
||||||
f.write(code)
|
f.write(code)
|
||||||
f.flush()
|
f.flush()
|
||||||
|
@ -2200,64 +2163,6 @@ async def main():
|
||||||
tasks.append(string_to_spv("matmul_f16_f32_aligned_m", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
|
tasks.append(string_to_spv("matmul_f16_f32_aligned_m", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
|
||||||
tasks.append(string_to_spv("matmul_f16_f32_aligned_s", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
|
tasks.append(string_to_spv("matmul_f16_f32_aligned_s", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
|
||||||
|
|
||||||
# Build dequant shaders
|
|
||||||
tasks.append(string_to_spv("f32_to_f16", f32_to_f16_src, {}, fp16))
|
|
||||||
|
|
||||||
for i in range(0, VK_NUM_TYPES):
|
|
||||||
stream.clear()
|
|
||||||
|
|
||||||
stream.extend((dequant_head, shader_int8_ext, shader_float_type))
|
|
||||||
|
|
||||||
if i == GGML_TYPE_F16:
|
|
||||||
stream.extend((shader_f16_defines, shader_f16_dequant_func_compat if not fp16 else shader_f16_dequant_func, dequant_body))
|
|
||||||
elif i == GGML_TYPE_Q4_0:
|
|
||||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func_compat if not fp16 else shader_q4_0_dequant_func, dequant_body))
|
|
||||||
elif i == GGML_TYPE_Q4_1:
|
|
||||||
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func_compat if not fp16 else shader_q4_1_dequant_func, dequant_body))
|
|
||||||
elif i == GGML_TYPE_Q5_0:
|
|
||||||
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func_compat if not fp16 else shader_q5_0_dequant_func, dequant_body))
|
|
||||||
elif i == GGML_TYPE_Q5_1:
|
|
||||||
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat if not fp16 else shader_q5_1_dequant_func, dequant_body))
|
|
||||||
elif i == GGML_TYPE_Q8_0:
|
|
||||||
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat if not fp16 else shader_q8_0_dequant_func, dequant_body))
|
|
||||||
elif i == GGML_TYPE_Q2_K:
|
|
||||||
stream.extend((shader_q2_K_defines, dequant_q2_K_body))
|
|
||||||
elif i == GGML_TYPE_Q3_K:
|
|
||||||
stream.extend((shader_q3_K_defines, dequant_q3_K_body))
|
|
||||||
elif i == GGML_TYPE_Q4_K:
|
|
||||||
stream.extend((shader_q4_K_defines, dequant_q4_K_body))
|
|
||||||
elif i == GGML_TYPE_Q5_K:
|
|
||||||
stream.extend((shader_q5_K_defines, dequant_q5_K_body))
|
|
||||||
elif i == GGML_TYPE_Q6_K:
|
|
||||||
stream.extend((shader_q6_K_defines, dequant_q6_K_body))
|
|
||||||
else:
|
|
||||||
continue
|
|
||||||
|
|
||||||
tasks.append(string_to_spv(f"dequant_{type_names[i]}", "".join(stream), {"D_TYPE": "float16_t"}, fp16))
|
|
||||||
|
|
||||||
# get_rows
|
|
||||||
for i in range(0, VK_NUM_TYPES):
|
|
||||||
stream.clear()
|
|
||||||
stream.extend((generic_head, shader_int8_ext, shader_float_type))
|
|
||||||
|
|
||||||
if i == GGML_TYPE_F16:
|
|
||||||
stream.extend((shader_f16_defines, shader_f16_dequant_func_compat if not fp16 else shader_f16_dequant_func, get_rows_body))
|
|
||||||
elif i == GGML_TYPE_Q4_0:
|
|
||||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func_compat if not fp16 else shader_q4_0_dequant_func, get_rows_body))
|
|
||||||
elif i == GGML_TYPE_Q4_1:
|
|
||||||
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func_compat if not fp16 else shader_q4_1_dequant_func, get_rows_body))
|
|
||||||
elif i == GGML_TYPE_Q5_0:
|
|
||||||
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func_compat if not fp16 else shader_q5_0_dequant_func, get_rows_body))
|
|
||||||
elif i == GGML_TYPE_Q5_1:
|
|
||||||
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat if not fp16 else shader_q5_1_dequant_func, get_rows_body))
|
|
||||||
elif i == GGML_TYPE_Q8_0:
|
|
||||||
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat if not fp16 else shader_q8_0_dequant_func, get_rows_body))
|
|
||||||
else:
|
|
||||||
continue
|
|
||||||
|
|
||||||
tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float16_t"}, fp16))
|
|
||||||
tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float"}, fp16))
|
|
||||||
|
|
||||||
# Shaders where precision is needed, so no fp16 version
|
# Shaders where precision is needed, so no fp16 version
|
||||||
|
|
||||||
# mul mat vec
|
# mul mat vec
|
||||||
|
@ -2266,17 +2171,17 @@ async def main():
|
||||||
stream.extend((mul_mat_vec_head, shader_int8_ext, shader_f32))
|
stream.extend((mul_mat_vec_head, shader_int8_ext, shader_f32))
|
||||||
|
|
||||||
if i == GGML_TYPE_F16:
|
if i == GGML_TYPE_F16:
|
||||||
stream.extend((shader_f16_defines, shader_f16_dequant_func_compat, mul_mat_vec_body))
|
stream.extend((shader_f16_defines, shader_f16_dequant_func, mul_mat_vec_body))
|
||||||
elif i == GGML_TYPE_Q4_0:
|
elif i == GGML_TYPE_Q4_0:
|
||||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func_compat, mul_mat_vec_body))
|
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, mul_mat_vec_body))
|
||||||
elif i == GGML_TYPE_Q4_1:
|
elif i == GGML_TYPE_Q4_1:
|
||||||
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func_compat, mul_mat_vec_body))
|
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, mul_mat_vec_body))
|
||||||
elif i == GGML_TYPE_Q5_0:
|
elif i == GGML_TYPE_Q5_0:
|
||||||
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func_compat, mul_mat_vec_body))
|
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, mul_mat_vec_body))
|
||||||
elif i == GGML_TYPE_Q5_1:
|
elif i == GGML_TYPE_Q5_1:
|
||||||
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat, mul_mat_vec_body))
|
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, mul_mat_vec_body))
|
||||||
elif i == GGML_TYPE_Q8_0:
|
elif i == GGML_TYPE_Q8_0:
|
||||||
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat, mul_mat_vec_body))
|
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, mul_mat_vec_body))
|
||||||
elif i == GGML_TYPE_Q2_K:
|
elif i == GGML_TYPE_Q2_K:
|
||||||
stream.extend((shader_q2_K_defines, mul_mat_vec_q2_K_body))
|
stream.extend((shader_q2_K_defines, mul_mat_vec_q2_K_body))
|
||||||
elif i == GGML_TYPE_Q3_K:
|
elif i == GGML_TYPE_Q3_K:
|
||||||
|
@ -2290,43 +2195,102 @@ async def main():
|
||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
tasks.append(string_to_spv(f"mul_mat_vec_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION}, fp16))
|
tasks.append(string_to_spv(f"mul_mat_vec_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION}))
|
||||||
|
|
||||||
tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, True))
|
# Dequant shaders
|
||||||
tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, True))
|
for i in range(0, VK_NUM_TYPES):
|
||||||
|
stream.clear()
|
||||||
|
|
||||||
|
stream.extend((dequant_head, shader_int8_ext, shader_f32))
|
||||||
|
|
||||||
|
if i == GGML_TYPE_F16:
|
||||||
|
stream.extend((shader_f16_defines, shader_f16_dequant_func, dequant_body))
|
||||||
|
elif i == GGML_TYPE_Q4_0:
|
||||||
|
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, dequant_body))
|
||||||
|
elif i == GGML_TYPE_Q4_1:
|
||||||
|
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, dequant_body))
|
||||||
|
elif i == GGML_TYPE_Q5_0:
|
||||||
|
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, dequant_body))
|
||||||
|
elif i == GGML_TYPE_Q5_1:
|
||||||
|
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, dequant_body))
|
||||||
|
elif i == GGML_TYPE_Q8_0:
|
||||||
|
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, dequant_body))
|
||||||
|
elif i == GGML_TYPE_Q2_K:
|
||||||
|
stream.extend((shader_q2_K_defines, dequant_q2_K_body))
|
||||||
|
elif i == GGML_TYPE_Q3_K:
|
||||||
|
stream.extend((shader_q3_K_defines, dequant_q3_K_body))
|
||||||
|
elif i == GGML_TYPE_Q4_K:
|
||||||
|
stream.extend((shader_q4_K_defines, dequant_q4_K_body))
|
||||||
|
elif i == GGML_TYPE_Q5_K:
|
||||||
|
stream.extend((shader_q5_K_defines, dequant_q5_K_body))
|
||||||
|
elif i == GGML_TYPE_Q6_K:
|
||||||
|
stream.extend((shader_q6_K_defines, dequant_q6_K_body))
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
|
||||||
|
tasks.append(string_to_spv(f"dequant_{type_names[i]}", "".join(stream), {"D_TYPE": "float16_t"}))
|
||||||
|
|
||||||
|
tasks.append(string_to_spv("f32_to_f16", f32_to_f16_src, {}))
|
||||||
|
|
||||||
|
# get_rows
|
||||||
|
for i in range(0, VK_NUM_TYPES):
|
||||||
|
stream.clear()
|
||||||
|
stream.extend((generic_head, shader_int8_ext, shader_f32))
|
||||||
|
|
||||||
|
if i == GGML_TYPE_F16:
|
||||||
|
stream.extend((shader_f16_defines, shader_f16_dequant_func, get_rows_body))
|
||||||
|
elif i == GGML_TYPE_Q4_0:
|
||||||
|
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, get_rows_body))
|
||||||
|
elif i == GGML_TYPE_Q4_1:
|
||||||
|
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, get_rows_body))
|
||||||
|
elif i == GGML_TYPE_Q5_0:
|
||||||
|
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, get_rows_body))
|
||||||
|
elif i == GGML_TYPE_Q5_1:
|
||||||
|
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, get_rows_body))
|
||||||
|
elif i == GGML_TYPE_Q8_0:
|
||||||
|
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, get_rows_body))
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
|
||||||
|
tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float16_t"}))
|
||||||
|
tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float"}))
|
||||||
|
|
||||||
|
|
||||||
|
tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||||
|
tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||||
|
|
||||||
# Norms
|
# Norms
|
||||||
tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||||
tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||||
|
|
||||||
tasks.append(string_to_spv("cpy_f32_f32", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
tasks.append(string_to_spv("cpy_f32_f32", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||||
tasks.append(string_to_spv("cpy_f32_f16", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}, True))
|
tasks.append(string_to_spv("cpy_f32_f16", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
|
||||||
tasks.append(string_to_spv("cpy_f16_f16", f"{cpy_src}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}, True))
|
tasks.append(string_to_spv("cpy_f16_f16", f"{cpy_src}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||||
|
|
||||||
tasks.append(string_to_spv("add_f32", f"{generic_head}\n{shader_f32}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, True))
|
tasks.append(string_to_spv("add_f32", f"{generic_head}\n{shader_f32}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||||
|
|
||||||
tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {}, True))
|
tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {}))
|
||||||
tasks.append(string_to_spv("mul_f32", f"{generic_head}\n{shader_f32}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, True))
|
tasks.append(string_to_spv("mul_f32", f"{generic_head}\n{shader_f32}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||||
|
|
||||||
tasks.append(string_to_spv("scale_f32", f"{generic_head}\n{shader_f32}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
tasks.append(string_to_spv("scale_f32", f"{generic_head}\n{shader_f32}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||||
|
|
||||||
tasks.append(string_to_spv("sqr_f32", f"{generic_head}\n{shader_f32}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
tasks.append(string_to_spv("sqr_f32", f"{generic_head}\n{shader_f32}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||||
|
|
||||||
tasks.append(string_to_spv("clamp_f32", f"{generic_head}\n{shader_f32}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
tasks.append(string_to_spv("clamp_f32", f"{generic_head}\n{shader_f32}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||||
|
|
||||||
tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||||
tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||||
tasks.append(string_to_spv("relu_f32", f"{generic_head}\n{shader_f32}\n{relu_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
tasks.append(string_to_spv("relu_f32", f"{generic_head}\n{shader_f32}\n{relu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||||
|
|
||||||
tasks.append(string_to_spv("diag_mask_inf_f32", f"{diag_mask_inf_head}\n{shader_f32}\n{diag_mask_inf_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
tasks.append(string_to_spv("diag_mask_inf_f32", f"{diag_mask_inf_head}\n{shader_f32}\n{diag_mask_inf_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||||
|
|
||||||
tasks.append(string_to_spv("soft_max_f32", f"{generic_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, True))
|
tasks.append(string_to_spv("soft_max_f32", f"{generic_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||||
|
|
||||||
tasks.append(string_to_spv("rope_f32", rope_src, {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
tasks.append(string_to_spv("rope_f32", rope_src, {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||||
tasks.append(string_to_spv("rope_f16", rope_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}, True))
|
tasks.append(string_to_spv("rope_f16", rope_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||||
|
|
||||||
tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||||
tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}, True))
|
tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||||
|
|
||||||
await asyncio.gather(*tasks)
|
await asyncio.gather(*tasks)
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue