Fix empty Vulkan host buffers
Add fp32 fp16 matmul shader
Fix matmul shader alignment

parent c1b295eea5
commit 179103c23d

3 changed files with 7966 additions and 4285 deletions
ggml-vulkan-shaders.hpp (12204 changes)
File diff suppressed because it is too large
ggml-vulkan.cpp

@@ -114,6 +114,7 @@ struct vk_device {
     size_t idx;
 
     vk_matmul_pipeline pipeline_matmul_f32;
+    vk_matmul_pipeline pipeline_matmul_f32_f16;
     vk_matmul_pipeline pipeline_matmul_f16;
     vk_matmul_pipeline pipeline_matmul_f16_f32;
     vk_pipeline pipeline_matmul_split_k_reduce;
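The new pipeline_matmul_f32_f16 member follows the existing pattern: vk_matmul_pipeline is a shared pointer to a bundle of pipeline variants, one per workgroup tile size, each in an unaligned and an aligned flavor. A minimal sketch of that bundle, inferred from the member accesses later in this diff (the real definition may carry more state):

    // Inferred from usage below: ->l/->m/->s are the unaligned large,
    // medium and small tile variants, ->a_l/->a_m/->a_s the aligned ones.
    struct vk_matmul_pipeline_struct {
        vk_pipeline l, m, s;
        vk_pipeline a_l, a_m, a_s;
    };
    typedef std::shared_ptr<vk_matmul_pipeline_struct> vk_matmul_pipeline;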
@@ -1013,6 +1014,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     uint32_t s_align = 32;
 
     ctx->device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_matmul_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_matmul_f16 = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
@@ -1048,6 +1050,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
 
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
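Each shape is registered twice: the unaligned pipelines pass a required alignment of 1 as the last argument to ggml_vk_create_pipeline, the aligned ones pass s_align/m_align/l_align (32 for the small tile, per the hunk above). A sketch of how a caller would pick between them; pick_variant is a hypothetical helper for illustration, not a function from this file:

    // Hypothetical helper: the aligned pipeline is only valid when the
    // inner dimension is a multiple of the pipeline's declared alignment.
    static vk_pipeline pick_variant(const vk_matmul_pipeline & pl, uint32_t k, uint32_t align) {
        return (k % align == 0) ? pl->a_s : pl->s;  // shown for the small tile
    }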
@@ -1230,6 +1239,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
 
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
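This second block mirrors the first but feeds the *_fp32_len/*_fp32_data SPIR-V blobs, so the same pipeline handles exist whether or not the device can do fp16 arithmetic in-shader. Presumably the two blocks sit on opposite sides of a device-capability branch, roughly like this sketch (the exact flag name is an assumption):

    if (ctx->device->fp16) {  // assumed capability flag
        // register matmul_*_len / matmul_*_data (fp16 math inside the shader)
    } else {
        // register matmul_*_fp32_len / matmul_*_fp32_data (pure fp32 fallback)
    }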
@@ -1903,6 +1919,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_f32;
     }
+    if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+        return ctx->device->pipeline_matmul_f32_f16;
+    }
     if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_f16_f32;
     }
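With the new branch, the pipeline lookup covers every float combination the backend generates:

    src0 F32, src1 F32  ->  pipeline_matmul_f32
    src0 F32, src1 F16  ->  pipeline_matmul_f32_f16   (added by this commit)
    src0 F16, src1 F32  ->  pipeline_matmul_f16_f32
    src0 F16, src1 F16  ->  pipeline_matmul_f16       (presumably handled just past the shown context)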
@ -4377,6 +4396,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|||
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f32->a_s;
|
||||
shname = "F32_ALIGNED_S";
|
||||
} else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f32_f16->a_s;
|
||||
shname = "F32_F16_ALIGNED_S";
|
||||
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f16_f32->a_s;
|
||||
shname = "F16_F32_ALIGNED_S";
|
||||
|
@ -4390,6 +4412,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|||
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f32->a_m;
|
||||
shname = "F32_ALIGNED_M";
|
||||
} else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f32_f16->a_m;
|
||||
shname = "F32_F16_ALIGNED_M";
|
||||
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f16_f32->a_m;
|
||||
shname = "F16_F32_ALIGNED_M";
|
||||
|
@ -4403,6 +4428,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|||
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f32->a_l;
|
||||
shname = "F32_ALIGNED_L";
|
||||
} else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f32_f16->a_l;
|
||||
shname = "F32_F16_ALIGNED_L";
|
||||
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f16_f32->a_l;
|
||||
shname = "F16_F32_ALIGNED_L";
|
||||
|
@ -4423,6 +4451,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|||
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f32->s;
|
||||
shname = "F32_S";
|
||||
} else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f32_f16->s;
|
||||
shname = "F32_F16_S";
|
||||
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f16_f32->s;
|
||||
shname = "F16_F32_S";
|
||||
|
@ -4434,6 +4465,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|||
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f32->m;
|
||||
shname = "F32_M";
|
||||
} else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f32_f16->m;
|
||||
shname = "F32_F16_M";
|
||||
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f16_f32->m;
|
||||
shname = "F16_F32_M";
|
||||
|
@ -4445,6 +4479,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|||
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f32->l;
|
||||
shname = "F32_L";
|
||||
} else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f32_f16->l;
|
||||
shname = "F32_F16_L";
|
||||
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
||||
p = ctx->device->pipeline_matmul_f16_f32->l;
|
||||
shname = "F16_F32_L";
|
||||
|
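The six hunks above make the same three-line addition for every tile size, aligned and unaligned, so the test harness can now exercise the F32_F16 shaders too. The selection idiom they all share, reduced to a self-contained sketch:

    // ggml_vk_test_matmul is a template over the input element types;
    // std::is_same<>() turns the instantiated types into runtime branches.
    #include <cstdint>
    #include <type_traits>
    typedef uint16_t ggml_fp16_t;  // stand-in; ggml defines the real one

    template <typename X_TYPE, typename Y_TYPE>
    const char * matmul_variant() {
        if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
            return "F32";
        } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
            return "F32_F16";  // the combination added by this commit
        } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
            return "F16_F32";
        }
        return "F16";
    }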
@ -6027,6 +6064,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu
|
|||
#ifdef GGML_VULKAN_DEBUG
|
||||
std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
|
||||
#endif
|
||||
size += 32; // Behave like the CPU buffer type
|
||||
void * ptr = nullptr;
|
||||
try {
|
||||
ptr = ggml_vk_host_malloc(&vk_instance.contexts[0], size);
|
||||
|
|
|
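The size += 32 line is the "fix empty Vulkan host buffers" part of the commit: a zero-byte buffer request would otherwise hand the allocator a size of 0, where a null result is indistinguishable from an allocation failure. Padding the request, as the CPU buffer type already does, guarantees a real allocation. A standalone illustration of the zero-size pitfall (not code from the diff):

    #include <cstdio>
    #include <cstdlib>

    int main() {
        void * empty  = std::malloc(0);      // implementation-defined: may be nullptr
        void * padded = std::malloc(0 + 32); // a real allocation (barring genuine OOM)
        std::printf("empty=%p padded=%p\n", empty, padded);
        std::free(empty);
        std::free(padded);
        return 0;
    }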
@ -2877,13 +2877,16 @@ async def main():
|
|||
stream.clear()
|
||||
stream.extend((mulmat_head, shader_float_type, mulmat_body1, mulmat_load_scalar, mulmat_body2))
|
||||
tasks.append(string_to_spv("matmul_f32", "".join(stream), {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
|
||||
tasks.append(string_to_spv("matmul_f32_aligned", "".join(stream), {"LOAD_VEC_A": 1, "LOAD_VEC_B": load_vec, "A_TYPE": "float", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
|
||||
tasks.append(string_to_spv("matmul_f32_aligned", "".join(stream), {"LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "A_TYPE": vec_type, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
|
||||
|
||||
tasks.append(string_to_spv("matmul_f32_f16", "".join(stream), {"A_TYPE": "float", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
|
||||
tasks.append(string_to_spv("matmul_f32_f16_aligned", "".join(stream), {"LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "A_TYPE": vec_type, "B_TYPE": vec_type_f16, "D_TYPE": "float"}, fp16))
|
||||
|
||||
tasks.append(string_to_spv("matmul_f16", "".join(stream), {"A_TYPE": "float16_t", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
|
||||
tasks.append(string_to_spv("matmul_f16_aligned", "".join(stream), {"LOAD_VEC_A": 1, "LOAD_VEC_B": load_vec, "A_TYPE": "float16_t", "B_TYPE": vec_type_f16, "D_TYPE": "float"}, fp16))
|
||||
tasks.append(string_to_spv("matmul_f16_aligned", "".join(stream), {"LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type_f16, "D_TYPE": "float"}, fp16))
|
||||
|
||||
tasks.append(string_to_spv("matmul_f16_f32", "".join(stream), {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
|
||||
tasks.append(string_to_spv("matmul_f16_f32_aligned", "".join(stream), {"LOAD_VEC_A": 1, "LOAD_VEC_B": load_vec, "A_TYPE": "float16_t", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
|
||||
tasks.append(string_to_spv("matmul_f16_f32_aligned", "".join(stream), {"LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
|
||||
|
||||
stream.clear()
|
||||
stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q4_0_defines, mulmat_body1, mulmat_load_q4_0, mulmat_body2))
|
||||
|
|
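Two changes land in this generator hunk: the new matmul_f32_f16 / matmul_f32_f16_aligned variants (the fp32 x fp16 shader from the commit title), and the alignment fix — the aligned f32, f16 and f16_f32 shaders previously kept LOAD_VEC_A at 1 with a scalar A_TYPE while B was loaded vectorized; now A uses load_vec and a vector type as well. What that difference means inside the generated shader, sketched as C++ pseudocode (names are illustrative, not from the generator):

    // LOAD_VEC_A = 1: one scalar element of A per load.
    void load_a_scalar(const float * A, float * tile, int i) {
        tile[i] = A[i];
    }

    // LOAD_VEC_A = 4: one 4-wide load per iteration; only safe when the
    // row base and stride keep every 4-element group aligned -- hence the
    // "aligned" shader variants, selected via the align parameter above.
    void load_a_vec4(const float * A, float * tile, int i) {
        const float * src = A + 4 * i;
        for (int c = 0; c < 4; ++c) {
            tile[4 * i + c] = src[c];
        }
    }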