Disable MoE code (not ready yet), fix a number of bugs in shaders and Vulkan code
This commit is contained in:
parent
3098206b00
commit
f3dc70402c
3 changed files with 18250 additions and 11684 deletions
29679
ggml-vulkan-shaders.hpp
29679
ggml-vulkan-shaders.hpp
File diff suppressed because it is too large
Load diff
121
ggml-vulkan.cpp
121
ggml-vulkan.cpp
|
@ -127,7 +127,8 @@ struct vk_device {
|
||||||
vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[VK_NUM_TYPES];
|
vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[VK_NUM_TYPES];
|
||||||
|
|
||||||
vk_pipeline pipeline_dequant[VK_NUM_TYPES];
|
vk_pipeline pipeline_dequant[VK_NUM_TYPES];
|
||||||
vk_pipeline pipeline_dequant_mul_mat_vec_f32[VK_NUM_TYPES];
|
vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[VK_NUM_TYPES];
|
||||||
|
vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[VK_NUM_TYPES];
|
||||||
vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[VK_NUM_TYPES];
|
vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[VK_NUM_TYPES];
|
||||||
|
|
||||||
vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
|
vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
|
||||||
|
@ -235,8 +236,6 @@ struct vk_mat_vec_push_constants {
|
||||||
uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
|
uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
|
||||||
uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
|
uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
|
||||||
uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
|
uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
|
||||||
uint32_t expert_stride_b; uint32_t expert_stride_d;
|
|
||||||
uint32_t idx; uint32_t nbi1; uint32_t n_as;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct vk_op_push_constants {
|
struct vk_op_push_constants {
|
||||||
|
@ -1408,17 +1407,29 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// mul mat vec
|
// mul mat vec
|
||||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32", mul_mat_vec_q4_0_f32_len, mul_mat_vec_q4_0_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32", mul_mat_vec_q4_1_f32_len, mul_mat_vec_q4_1_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32", mul_mat_vec_q5_0_f32_len, mul_mat_vec_q5_0_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32", mul_mat_vec_q5_1_f32_len, mul_mat_vec_q5_1_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32", mul_mat_vec_q8_0_f32_len, mul_mat_vec_q8_0_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f32", mul_mat_vec_q2_K_f32_len, mul_mat_vec_q2_K_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f32_f32", mul_mat_vec_q2_K_f32_f32_len, mul_mat_vec_q2_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f32", mul_mat_vec_q3_K_f32_len, mul_mat_vec_q3_K_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f32_f32", mul_mat_vec_q3_K_f32_f32_len, mul_mat_vec_q3_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f32", mul_mat_vec_q4_K_f32_len, mul_mat_vec_q4_K_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f32_f32", mul_mat_vec_q4_K_f32_f32_len, mul_mat_vec_q4_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f32", mul_mat_vec_q5_K_f32_len, mul_mat_vec_q5_K_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f32_f32", mul_mat_vec_q5_K_f32_f32_len, mul_mat_vec_q5_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32", mul_mat_vec_q6_K_f32_len, mul_mat_vec_q6_K_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32_f32", mul_mat_vec_q6_K_f32_f32_len, mul_mat_vec_q6_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
|
|
||||||
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f16_f32", mul_mat_vec_q2_K_f16_f32_len, mul_mat_vec_q2_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f16_f32", mul_mat_vec_q3_K_f16_f32_len, mul_mat_vec_q3_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f16_f32", mul_mat_vec_q4_K_f16_f32_len, mul_mat_vec_q4_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f16_f32", mul_mat_vec_q5_K_f16_f32_len, mul_mat_vec_q5_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f16_f32", mul_mat_vec_q6_K_f16_f32_len, mul_mat_vec_q6_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
|
|
||||||
/*ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
/*ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||||
|
@ -1954,11 +1965,13 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
|
||||||
return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type];
|
return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type];
|
||||||
}
|
}
|
||||||
|
|
||||||
static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type type) {
|
static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
|
std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
switch (type) {
|
GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);
|
||||||
|
|
||||||
|
switch (a_type) {
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
case GGML_TYPE_Q4_0:
|
case GGML_TYPE_Q4_0:
|
||||||
case GGML_TYPE_Q4_1:
|
case GGML_TYPE_Q4_1:
|
||||||
|
@ -1975,7 +1988,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
return ctx->device->pipeline_dequant_mul_mat_vec_f32[type];
|
return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[a_type] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[a_type];
|
||||||
}
|
}
|
||||||
|
|
||||||
static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
|
static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
|
||||||
|
@ -2181,6 +2194,9 @@ static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t ggml_vk_align_size(size_t width, size_t align) {
|
static size_t ggml_vk_align_size(size_t width, size_t align) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << "ggml_vk_align_size(" << width << ", " << align << ")" << std::endl;
|
||||||
|
#endif
|
||||||
return CEIL_DIV(width, align) * align;
|
return CEIL_DIV(width, align) * align;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2636,11 +2652,13 @@ static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")" << std::endl;
|
std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
|
// if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
|
||||||
return 4;
|
// return 4;
|
||||||
}
|
// }
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
|
GGML_UNUSED(m); GGML_UNUSED(n); GGML_UNUSED(k);
|
||||||
}
|
}
|
||||||
|
|
||||||
static vk_pipeline ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
|
static vk_pipeline ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
|
||||||
|
@ -2692,7 +2710,7 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl;
|
std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, false)->align;
|
return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_matmul(
|
static void ggml_vk_matmul(
|
||||||
|
@ -2858,7 +2876,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
||||||
const int d_ne = ne11 * ne01;
|
const int d_ne = ne11 * ne01;
|
||||||
|
|
||||||
const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11));
|
const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11));
|
||||||
const bool aligned = ne10 == kpad;
|
const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8;
|
||||||
|
|
||||||
const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);
|
const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);
|
||||||
|
|
||||||
|
@ -2997,9 +3015,6 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
||||||
const uint64_t ne22 = dst->ne[2];
|
const uint64_t ne22 = dst->ne[2];
|
||||||
const uint64_t ne23 = dst->ne[3];
|
const uint64_t ne23 = dst->ne[3];
|
||||||
|
|
||||||
const uint64_t nb22 = dst->nb[2];
|
|
||||||
const uint64_t nb23 = dst->nb[3];
|
|
||||||
|
|
||||||
const uint64_t r2 = ne12 / ne02;
|
const uint64_t r2 = ne12 / ne02;
|
||||||
const uint64_t r3 = ne13 / ne03;
|
const uint64_t r3 = ne13 / ne03;
|
||||||
|
|
||||||
|
@ -3085,7 +3100,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
||||||
} else {
|
} else {
|
||||||
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
||||||
}
|
}
|
||||||
vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type);
|
vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type);
|
||||||
GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
|
GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
|
||||||
GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
|
GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
|
||||||
GGML_ASSERT(dmmv != nullptr);
|
GGML_ASSERT(dmmv != nullptr);
|
||||||
|
@ -3124,7 +3139,6 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
||||||
(uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
|
(uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
|
||||||
(uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
|
(uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
|
||||||
stride_batch_x, stride_batch_y, (uint32_t)(ne20*ne21),
|
stride_batch_x, stride_batch_y, (uint32_t)(ne20*ne21),
|
||||||
0, 0, 0, 0, 1
|
|
||||||
};
|
};
|
||||||
ggml_vk_sync_buffers(subctx);
|
ggml_vk_sync_buffers(subctx);
|
||||||
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} }, sizeof(vk_mat_vec_push_constants), &pc, { (uint32_t)ne01, (uint32_t)(ne12 * ne13), 1});
|
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} }, sizeof(vk_mat_vec_push_constants), &pc, { (uint32_t)ne01, (uint32_t)(ne12 * ne13), 1});
|
||||||
|
@ -3324,7 +3338,7 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
|
/*static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << "ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
std::cerr << "ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
||||||
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
||||||
|
@ -3400,7 +3414,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
|
||||||
const int d_ne = ne11 * ne01;
|
const int d_ne = ne11 * ne01;
|
||||||
|
|
||||||
const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11));
|
const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11));
|
||||||
const bool aligned = ne10 == kpad;
|
const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8;
|
||||||
|
|
||||||
const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);
|
const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);
|
||||||
|
|
||||||
|
@ -3621,7 +3635,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
||||||
} else {
|
} else {
|
||||||
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
||||||
}
|
}
|
||||||
vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type);
|
vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type);
|
||||||
GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
|
GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
|
||||||
GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
|
GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
|
||||||
GGML_ASSERT(dmmv != nullptr);
|
GGML_ASSERT(dmmv != nullptr);
|
||||||
|
@ -3660,11 +3674,11 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
||||||
(uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
|
(uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
|
||||||
(uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
|
(uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
|
||||||
stride_batch_x, stride_batch_y, (uint32_t)(ne20*ne21),
|
stride_batch_x, stride_batch_y, (uint32_t)(ne20*ne21),
|
||||||
0, 0, 0, 0, 1
|
// 0, 0, 0, 0, 1
|
||||||
};
|
};
|
||||||
ggml_vk_sync_buffers(subctx);
|
ggml_vk_sync_buffers(subctx);
|
||||||
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} }, sizeof(vk_mat_vec_push_constants), &pc, { (uint32_t)ne01, (uint32_t)(ne12 * ne13), 1});
|
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} }, sizeof(vk_mat_vec_push_constants), &pc, { (uint32_t)ne01, (uint32_t)(ne12 * ne13), 1});
|
||||||
}
|
}*/
|
||||||
|
|
||||||
static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
// guaranteed to be an integer due to the check in ggml_can_repeat
|
// guaranteed to be an integer due to the check in ggml_can_repeat
|
||||||
|
@ -3869,6 +3883,21 @@ static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
|
||||||
|
switch (op) {
|
||||||
|
case GGML_OP_CPY:
|
||||||
|
case GGML_OP_GET_ROWS:
|
||||||
|
case GGML_OP_ADD:
|
||||||
|
case GGML_OP_MUL:
|
||||||
|
case GGML_OP_SCALE:
|
||||||
|
case GGML_OP_SQR:
|
||||||
|
case GGML_OP_CLAMP:
|
||||||
|
return true;
|
||||||
|
default:
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template<typename PC>
|
template<typename PC>
|
||||||
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
|
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
@ -3920,6 +3949,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op);
|
||||||
|
|
||||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||||
ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
||||||
|
@ -3981,7 +4012,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||||
GGML_ASSERT(d_Z != nullptr);
|
GGML_ASSERT(d_Z != nullptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS) {
|
if (op_supports_incontiguous) {
|
||||||
x_sz = ggml_nbytes(src0);
|
x_sz = ggml_nbytes(src0);
|
||||||
y_sz = use_src1 ? ggml_nbytes(src1) : 0;
|
y_sz = use_src1 ? ggml_nbytes(src1) : 0;
|
||||||
d_sz = ggml_nbytes(dst);
|
d_sz = ggml_nbytes(dst);
|
||||||
|
@ -4000,7 +4031,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||||
std::array<uint32_t, 3> elements;
|
std::array<uint32_t, 3> elements;
|
||||||
|
|
||||||
// Single call if dimension 2 is contiguous
|
// Single call if dimension 2 is contiguous
|
||||||
if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
|
if (op_supports_incontiguous || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
|
||||||
ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);
|
ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);
|
||||||
|
|
||||||
switch (dst->op) {
|
switch (dst->op) {
|
||||||
|
@ -4021,7 +4052,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (op != GGML_OP_CPY && op != GGML_OP_GET_ROWS) {
|
if (!op_supports_incontiguous) {
|
||||||
if (x_sz != VK_WHOLE_SIZE) {
|
if (x_sz != VK_WHOLE_SIZE) {
|
||||||
x_sz *= ne02 * ne03;
|
x_sz *= ne02 * ne03;
|
||||||
}
|
}
|
||||||
|
@ -4039,14 +4070,14 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||||
if (use_src1) {
|
if (use_src1) {
|
||||||
subbuf_y = { d_Y, y_buf_offset, y_sz };
|
subbuf_y = { d_Y, y_buf_offset, y_sz };
|
||||||
} else {
|
} else {
|
||||||
subbuf_y = { ctx->prealloc_y, 0, ctx->prealloc_y->size };
|
subbuf_y = { d_X, 0, d_X->size };
|
||||||
}
|
}
|
||||||
|
|
||||||
vk_subbuffer subbuf_z;
|
vk_subbuffer subbuf_z;
|
||||||
if (use_src2) {
|
if (use_src2) {
|
||||||
subbuf_z = { d_Z, z_buf_offset, z_sz };
|
subbuf_z = { d_Z, z_buf_offset, z_sz };
|
||||||
} else {
|
} else {
|
||||||
subbuf_z = { ctx->prealloc_y, 0, ctx->prealloc_y->size };
|
subbuf_z = { d_X, 0, d_X->size };
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_vk_sync_buffers(subctx);
|
ggml_vk_sync_buffers(subctx);
|
||||||
|
@ -4218,7 +4249,9 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f });
|
float * op_params = (float *)dst->op_params;
|
||||||
|
|
||||||
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||||
|
@ -4253,7 +4286,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
|
||||||
|
|
||||||
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_SOFT_MAX, {
|
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_SOFT_MAX, {
|
||||||
ncols,
|
ncols,
|
||||||
nrows_y,
|
src1 != nullptr ? nrows_y : (uint32_t)0,
|
||||||
src2 != nullptr ? (uint32_t)1 : (uint32_t)0,
|
src2 != nullptr ? (uint32_t)1 : (uint32_t)0,
|
||||||
scale, max_bias,
|
scale, max_bias,
|
||||||
m0, m1,
|
m0, m1,
|
||||||
|
@ -5232,6 +5265,8 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
||||||
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K);
|
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K);
|
||||||
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K);
|
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K);
|
||||||
|
|
||||||
|
ggml_vk_test_matmul<ggml_fp16_t, ggml_fp16_t>(ctx, 512, 512, 100, 32, 100, 1, 2);
|
||||||
|
|
||||||
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 0);
|
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 0);
|
||||||
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 1);
|
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 1);
|
||||||
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 2);
|
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 2);
|
||||||
|
@ -6219,7 +6254,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;
|
// ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;
|
||||||
|
|
||||||
switch (op->op) {
|
switch (op->op) {
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
|
@ -6843,7 +6878,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||||
tensor_clone = ggml_soft_max(ggml_ctx, src0_clone);
|
tensor_clone = ggml_soft_max(ggml_ctx, src0_clone);
|
||||||
}
|
}
|
||||||
} else if (tensor->op == GGML_OP_DIAG_MASK_INF) {
|
} else if (tensor->op == GGML_OP_DIAG_MASK_INF) {
|
||||||
tensor_clone = ggml_diag_mask_inf(ggml_ctx, src0_clone, *(float *)tensor->op_params);
|
tensor_clone = ggml_diag_mask_inf(ggml_ctx, src0_clone, *(int *)tensor->op_params);
|
||||||
} else if (tensor->op == GGML_OP_ROPE) {
|
} else if (tensor->op == GGML_OP_ROPE) {
|
||||||
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
||||||
const int mode = ((int32_t *) tensor->op_params)[2];
|
const int mode = ((int32_t *) tensor->op_params)[2];
|
||||||
|
@ -6891,6 +6926,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||||
tensor_clone = ggml_transpose(ggml_ctx, src0_clone);
|
tensor_clone = ggml_transpose(ggml_ctx, src0_clone);
|
||||||
} else if (tensor->op == GGML_OP_GET_ROWS) {
|
} else if (tensor->op == GGML_OP_GET_ROWS) {
|
||||||
tensor_clone = ggml_get_rows(ggml_ctx, src0_clone, src1_clone);
|
tensor_clone = ggml_get_rows(ggml_ctx, src0_clone, src1_clone);
|
||||||
|
} else if (tensor->op == GGML_OP_ARGSORT) {
|
||||||
|
tensor_clone = ggml_argsort(ggml_ctx, src0_clone, (ggml_sort_order) *(int *)tensor->op_params);
|
||||||
} else {
|
} else {
|
||||||
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
|
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
|
@ -6924,7 +6961,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||||
free(src1_buffer);
|
free(src1_buffer);
|
||||||
}
|
}
|
||||||
if (src2 != nullptr) {
|
if (src2 != nullptr) {
|
||||||
free(src1_buffer);
|
free(src2_buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_free(ggml_ctx);
|
ggml_free(ggml_ctx);
|
||||||
|
|
|
@ -164,53 +164,53 @@ struct block_q6_K
|
||||||
|
|
||||||
# Dequant functions
|
# Dequant functions
|
||||||
shader_float_dequant_func = """
|
shader_float_dequant_func = """
|
||||||
vec2 dequantize(uint ib, uint iqs) {
|
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||||
return vec2(data_a[ib], data_a[ib + 1]);
|
return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
shader_q4_0_dequant_func = """
|
shader_q4_0_dequant_func = """
|
||||||
vec2 dequantize(uint ib, uint iqs) {
|
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||||
const float d = float(data_a[ib].d);
|
const float d = float(data_a[a_offset + ib].d);
|
||||||
const uint vui = uint(data_a[ib].qs[iqs]);
|
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
||||||
return (vec2(vui & 0xF, vui >> 4) - 8.0f) * d;
|
return (vec2(vui & 0xF, vui >> 4) - 8.0f) * d;
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
shader_q4_1_dequant_func = """
|
shader_q4_1_dequant_func = """
|
||||||
vec2 dequantize(uint ib, uint iqs) {
|
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||||
const float d = float(data_a[ib].d);
|
const float d = float(data_a[a_offset + ib].d);
|
||||||
const float m = float(data_a[ib].m);
|
const float m = float(data_a[a_offset + ib].m);
|
||||||
const uint vui = uint(data_a[ib].qs[iqs]);
|
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
||||||
return vec2(vui & 0xF, vui >> 4) * d + m;
|
return vec2(vui & 0xF, vui >> 4) * d + m;
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
shader_q5_0_dequant_func = """
|
shader_q5_0_dequant_func = """
|
||||||
vec2 dequantize(uint ib, uint iqs) {
|
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||||
const float d = float(data_a[ib].d);
|
const float d = float(data_a[a_offset + ib].d);
|
||||||
const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0];
|
const uint uint_qh = uint(data_a[a_offset + ib].qh[1]) << 16 | data_a[a_offset + ib].qh[0];
|
||||||
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
|
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
|
||||||
const uint vui = uint(data_a[ib].qs[iqs]);
|
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
||||||
return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f) * d;
|
return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f) * d;
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
shader_q5_1_dequant_func = """
|
shader_q5_1_dequant_func = """
|
||||||
vec2 dequantize(uint ib, uint iqs) {
|
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||||
const float d = float(data_a[ib].d);
|
const float d = float(data_a[a_offset + ib].d);
|
||||||
const float m = float(data_a[ib].m);
|
const float m = float(data_a[a_offset + ib].m);
|
||||||
const uint uint_qh = data_a[ib].qh;
|
const uint uint_qh = data_a[a_offset + ib].qh;
|
||||||
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
|
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
|
||||||
const uint vui = uint(data_a[ib].qs[iqs]);
|
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
||||||
return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) * d + m;
|
return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) * d + m;
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
shader_q8_0_dequant_func = """
|
shader_q8_0_dequant_func = """
|
||||||
vec2 dequantize(uint ib, uint iqs) {
|
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||||
const float d = float(data_a[ib].d);
|
const float d = float(data_a[a_offset + ib].d);
|
||||||
return vec2(int(data_a[ib].qs[iqs]), int(data_a[ib].qs[iqs + 1])) * d;
|
return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -1244,12 +1244,12 @@ void main() {
|
||||||
tmp[tid] = FLOAT_TYPE(0.0f);
|
tmp[tid] = FLOAT_TYPE(0.0f);
|
||||||
|
|
||||||
[[unroll]] for (uint i = 0; i < p.ncols/BLOCK_SIZE; i += 2) {
|
[[unroll]] for (uint i = 0; i < p.ncols/BLOCK_SIZE; i += 2) {
|
||||||
const uint col = a_offset + i*BLOCK_SIZE + 2*tid;
|
const uint col = i*BLOCK_SIZE + 2*tid;
|
||||||
const uint ib = (row*p.ncols + col)/QUANT_K; // block index
|
const uint ib = (row*p.ncols + col)/QUANT_K; // block index
|
||||||
const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
|
const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
|
||||||
const uint iybs = col - col%QUANT_K; // y block start index
|
const uint iybs = col - col%QUANT_K; // y block start index
|
||||||
|
|
||||||
vec2 v = dequantize(ib, iqs);
|
vec2 v = dequantize(ib, iqs, a_offset / QUANT_K);
|
||||||
|
|
||||||
// matrix multiplication
|
// matrix multiplication
|
||||||
tmp[tid] += FLOAT_TYPE(v.x) * FLOAT_TYPE(data_b[b_offset + iybs + iqs]) +
|
tmp[tid] += FLOAT_TYPE(v.x) * FLOAT_TYPE(data_b[b_offset + iybs + iqs]) +
|
||||||
|
@ -1568,15 +1568,15 @@ void main() {
|
||||||
const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 66] >> 4);
|
const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 66] >> 4);
|
||||||
const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 67] >> 4);
|
const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 67] >> 4);
|
||||||
|
|
||||||
const FLOAT_TYPE sx = FLOAT_TYPE(data_b[b_offset + y1_idx] * q4_0 + data_b[b_offset + y1_idx + 1] * q4_1 + data_b[b_offset + y1_idx + 2] * q4_2 + data_b[b_offset + y1_idx + 3] * q4_3);
|
const FLOAT_TYPE sx = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx]) * q4_0 + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * q4_1 + FLOAT_TYPE(data_b[b_offset + y1_idx + 2]) * q4_2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 3]) * q4_3);
|
||||||
const FLOAT_TYPE sy = FLOAT_TYPE(data_b[b_offset + y1_idx + 32] * q4_4 + data_b[b_offset + y1_idx + 33] * q4_5 + data_b[b_offset + y1_idx + 34] * q4_6 + data_b[b_offset + y1_idx + 35] * q4_7);
|
const FLOAT_TYPE sy = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * q4_4 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * q4_5 + FLOAT_TYPE(data_b[b_offset + y1_idx + 34]) * q4_6 + FLOAT_TYPE(data_b[b_offset + y1_idx + 35]) * q4_7);
|
||||||
const FLOAT_TYPE sz = FLOAT_TYPE(data_b[b_offset + y2_idx] * q4_8 + data_b[b_offset + y2_idx + 1] * q4_9 + data_b[b_offset + y2_idx + 2] * q4_10 + data_b[b_offset + y2_idx + 3] * q4_11);
|
const FLOAT_TYPE sz = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx]) * q4_8 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * q4_9 + FLOAT_TYPE(data_b[b_offset + y2_idx + 2]) * q4_10 + FLOAT_TYPE(data_b[b_offset + y2_idx + 3]) * q4_11);
|
||||||
const FLOAT_TYPE sw = FLOAT_TYPE(data_b[b_offset + y2_idx + 32] * q4_12 + data_b[b_offset + y2_idx + 33] * q4_13 + data_b[b_offset + y2_idx + 34] * q4_14 + data_b[b_offset + y2_idx + 35] * q4_15);
|
const FLOAT_TYPE sw = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * q4_12 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * q4_13 + FLOAT_TYPE(data_b[b_offset + y2_idx + 34]) * q4_14 + FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * q4_15);
|
||||||
const FLOAT_TYPE smin = FLOAT_TYPE(
|
const FLOAT_TYPE smin = FLOAT_TYPE(
|
||||||
data_b[b_offset + y1_idx ] * sc2 + data_b[b_offset + y1_idx + 32] * sc3 + data_b[b_offset + y2_idx ] * sc6 + data_b[b_offset + y2_idx + 32] * sc7
|
FLOAT_TYPE(data_b[b_offset + y1_idx ]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx ]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * sc7
|
||||||
+ data_b[b_offset + y1_idx + 1] * sc2 + data_b[b_offset + y1_idx + 33] * sc3 + data_b[b_offset + y2_idx + 1] * sc6 + data_b[b_offset + y2_idx + 33] * sc7
|
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * sc7
|
||||||
+ data_b[b_offset + y1_idx + 2] * sc2 + data_b[b_offset + y1_idx + 34] * sc3 + data_b[b_offset + y2_idx + 2] * sc6 + data_b[b_offset + y2_idx + 34] * sc7
|
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 2]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 34]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 2]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 34]) * sc7
|
||||||
+ data_b[b_offset + y1_idx + 3] * sc2 + data_b[b_offset + y1_idx + 35] * sc3 + data_b[b_offset + y2_idx + 3] * sc6 + data_b[b_offset + y2_idx + 35] * sc7
|
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 3]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 35]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 3]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * sc7
|
||||||
);
|
);
|
||||||
tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * sc0 + sy * sc1 + sz * sc4 + sw * sc5) - dmin * smin);
|
tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * sc0 + sy * sc1 + sz * sc4 + sw * sc5) - dmin * smin);
|
||||||
#else
|
#else
|
||||||
|
@ -1589,13 +1589,13 @@ void main() {
|
||||||
const uint8_t q4_6 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] >> 4);
|
const uint8_t q4_6 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] >> 4);
|
||||||
const uint8_t q4_7 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] >> 4);
|
const uint8_t q4_7 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] >> 4);
|
||||||
|
|
||||||
const FLOAT_TYPE sx = FLOAT_TYPE(data_b[b_offset + y1_idx ] * q4_0 + data_b[b_offset + y1_idx + 1] * q4_1);
|
const FLOAT_TYPE sx = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx ]) * q4_0 + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * q4_1);
|
||||||
const FLOAT_TYPE sy = FLOAT_TYPE(data_b[b_offset + y1_idx + 32] * q4_2 + data_b[b_offset + y1_idx + 33] * q4_3);
|
const FLOAT_TYPE sy = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * q4_2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * q4_3);
|
||||||
const FLOAT_TYPE sz = FLOAT_TYPE(data_b[b_offset + y2_idx ] * q4_4 + data_b[b_offset + y2_idx + 1] * q4_5);
|
const FLOAT_TYPE sz = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx ]) * q4_4 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * q4_5);
|
||||||
const FLOAT_TYPE sw = FLOAT_TYPE(data_b[b_offset + y2_idx + 32] * q4_6 + data_b[b_offset + y2_idx + 33] * q4_7);
|
const FLOAT_TYPE sw = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * q4_6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * q4_7);
|
||||||
const FLOAT_TYPE smin = FLOAT_TYPE(
|
const FLOAT_TYPE smin = FLOAT_TYPE(
|
||||||
data_b[b_offset + y1_idx] * sc2 + data_b[b_offset + y1_idx + 32] * sc3 + data_b[b_offset + y2_idx] * sc6 + data_b[b_offset + y2_idx + 32] * sc7
|
FLOAT_TYPE(data_b[b_offset + y1_idx]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * sc7
|
||||||
+ data_b[b_offset + y1_idx + 1] * sc2 + data_b[b_offset + y1_idx + 33] * sc3 + data_b[b_offset + y2_idx + 1] * sc6 + data_b[b_offset + y2_idx + 33] * sc7
|
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * sc7
|
||||||
);
|
);
|
||||||
|
|
||||||
tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f) + sy * FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f) + sz * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)) + sw * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 + i].scales[v_im + 1] & 0xc0) >> 2))) - dmin * smin);
|
tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f) + sy * FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f) + sz * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)) + sw * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 + i].scales[v_im + 1] & 0xc0) >> 2))) - dmin * smin);
|
||||||
|
@ -1713,32 +1713,32 @@ void main() {
|
||||||
const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] >> 4);
|
const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] >> 4);
|
||||||
|
|
||||||
const FLOAT_TYPE sx = FLOAT_TYPE(
|
const FLOAT_TYPE sx = FLOAT_TYPE(
|
||||||
data_b[b_offset + y1_idx ] * (q4_0 + (((data_a[ib0 + i].qh[l0 ] & hm1) != 0) ? 16 : 0))
|
FLOAT_TYPE(data_b[b_offset + y1_idx ]) * (q4_0 + (((data_a[ib0 + i].qh[l0 ] & hm1) != 0) ? 16 : 0))
|
||||||
+ data_b[b_offset + y1_idx + 1] * (q4_1 + (((data_a[ib0 + i].qh[l0 + 1] & hm1) != 0) ? 16 : 0))
|
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * (q4_1 + (((data_a[ib0 + i].qh[l0 + 1] & hm1) != 0) ? 16 : 0))
|
||||||
+ data_b[b_offset + y1_idx + 16] * (q4_2 + (((data_a[ib0 + i].qh[l0 + 16] & hm1) != 0) ? 16 : 0))
|
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) * (q4_2 + (((data_a[ib0 + i].qh[l0 + 16] & hm1) != 0) ? 16 : 0))
|
||||||
+ data_b[b_offset + y1_idx + 17] * (q4_3 + (((data_a[ib0 + i].qh[l0 + 17] & hm1) != 0) ? 16 : 0))
|
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 17]) * (q4_3 + (((data_a[ib0 + i].qh[l0 + 17] & hm1) != 0) ? 16 : 0))
|
||||||
);
|
);
|
||||||
const FLOAT_TYPE sy = FLOAT_TYPE(
|
const FLOAT_TYPE sy = FLOAT_TYPE(
|
||||||
data_b[b_offset + y1_idx + 32] * (q4_4 + (((data_a[ib0 + i].qh[l0 ] & (hm1 << 1)) != 0) ? 16 : 0))
|
FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * (q4_4 + (((data_a[ib0 + i].qh[l0 ] & (hm1 << 1)) != 0) ? 16 : 0))
|
||||||
+ data_b[b_offset + y1_idx + 33] * (q4_5 + (((data_a[ib0 + i].qh[l0 + 1] & (hm1 << 1)) != 0) ? 16 : 0))
|
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * (q4_5 + (((data_a[ib0 + i].qh[l0 + 1] & (hm1 << 1)) != 0) ? 16 : 0))
|
||||||
+ data_b[b_offset + y1_idx + 48] * (q4_6 + (((data_a[ib0 + i].qh[l0 + 16] & (hm1 << 1)) != 0) ? 16 : 0))
|
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) * (q4_6 + (((data_a[ib0 + i].qh[l0 + 16] & (hm1 << 1)) != 0) ? 16 : 0))
|
||||||
+ data_b[b_offset + y1_idx + 49] * (q4_7 + (((data_a[ib0 + i].qh[l0 + 17] & (hm1 << 1)) != 0) ? 16 : 0))
|
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 49]) * (q4_7 + (((data_a[ib0 + i].qh[l0 + 17] & (hm1 << 1)) != 0) ? 16 : 0))
|
||||||
);
|
);
|
||||||
const FLOAT_TYPE sz = FLOAT_TYPE(
|
const FLOAT_TYPE sz = FLOAT_TYPE(
|
||||||
data_b[b_offset + y2_idx ] * (q4_8 + (((data_a[ib0 + i].qh[l0 ] & hm2) != 0) ? 16 : 0))
|
FLOAT_TYPE(data_b[b_offset + y2_idx ]) * (q4_8 + (((data_a[ib0 + i].qh[l0 ] & hm2) != 0) ? 16 : 0))
|
||||||
+ data_b[b_offset + y2_idx + 1] * (q4_9 + (((data_a[ib0 + i].qh[l0 + 1] & hm2) != 0) ? 16 : 0))
|
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * (q4_9 + (((data_a[ib0 + i].qh[l0 + 1] & hm2) != 0) ? 16 : 0))
|
||||||
+ data_b[b_offset + y2_idx + 16] * (q4_10 + (((data_a[ib0 + i].qh[l0 + 16] & hm2) != 0) ? 16 : 0))
|
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) * (q4_10 + (((data_a[ib0 + i].qh[l0 + 16] & hm2) != 0) ? 16 : 0))
|
||||||
+ data_b[b_offset + y2_idx + 17] * (q4_11 + (((data_a[ib0 + i].qh[l0 + 17] & hm2) != 0) ? 16 : 0))
|
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 17]) * (q4_11 + (((data_a[ib0 + i].qh[l0 + 17] & hm2) != 0) ? 16 : 0))
|
||||||
);
|
);
|
||||||
const FLOAT_TYPE sw = FLOAT_TYPE(
|
const FLOAT_TYPE sw = FLOAT_TYPE(
|
||||||
data_b[b_offset + y2_idx + 32] * (q4_12 + (((data_a[ib0 + i].qh[l0 ] & (hm2 << 1)) != 0) ? 16 : 0))
|
FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * (q4_12 + (((data_a[ib0 + i].qh[l0 ] & (hm2 << 1)) != 0) ? 16 : 0))
|
||||||
+ data_b[b_offset + y2_idx + 33] * (q4_13 + (((data_a[ib0 + i].qh[l0 + 1] & (hm2 << 1)) != 0) ? 16 : 0))
|
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * (q4_13 + (((data_a[ib0 + i].qh[l0 + 1] & (hm2 << 1)) != 0) ? 16 : 0))
|
||||||
+ data_b[b_offset + y2_idx + 48] * (q4_14 + (((data_a[ib0 + i].qh[l0 + 16] & (hm2 << 1)) != 0) ? 16 : 0))
|
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) * (q4_14 + (((data_a[ib0 + i].qh[l0 + 16] & (hm2 << 1)) != 0) ? 16 : 0))
|
||||||
+ data_b[b_offset + y2_idx + 49] * (q4_15 + (((data_a[ib0 + i].qh[l0 + 17] & (hm2 << 1)) != 0) ? 16 : 0))
|
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 49]) * (q4_15 + (((data_a[ib0 + i].qh[l0 + 17] & (hm2 << 1)) != 0) ? 16 : 0))
|
||||||
);
|
);
|
||||||
const FLOAT_TYPE smin = FLOAT_TYPE(
|
const FLOAT_TYPE smin = FLOAT_TYPE(
|
||||||
(data_b[b_offset + y1_idx] + data_b[b_offset + y1_idx + 1] + data_b[b_offset + y1_idx + 16] + data_b[b_offset + y1_idx + 17]) * sc2 + (data_b[b_offset + y1_idx + 32] + data_b[b_offset + y1_idx + 33] + data_b[b_offset + y1_idx + 48] + data_b[b_offset + y1_idx + 49]) * sc3
|
(FLOAT_TYPE(data_b[b_offset + y1_idx]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 17])) * sc2 + (FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 49])) * sc3
|
||||||
+ (data_b[b_offset + y2_idx] + data_b[b_offset + y2_idx + 1] + data_b[b_offset + y2_idx + 16] + data_b[b_offset + y2_idx + 17]) * sc6 + (data_b[b_offset + y2_idx + 32] + data_b[b_offset + y2_idx + 33] + data_b[b_offset + y2_idx + 48] + data_b[b_offset + y2_idx + 49]) * sc7
|
+ (FLOAT_TYPE(data_b[b_offset + y2_idx]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 17])) * sc6 + (FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 49])) * sc7
|
||||||
);
|
);
|
||||||
tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * sc0 + sy * sc1 + sz * sc4 + sw * sc5) - dmin * smin);
|
tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * sc0 + sy * sc1 + sz * sc4 + sw * sc5) - dmin * smin);
|
||||||
}
|
}
|
||||||
|
@ -2154,7 +2154,7 @@ add_body = """
|
||||||
|
|
||||||
# SCALE
|
# SCALE
|
||||||
scale_body = """
|
scale_body = """
|
||||||
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) + FLOAT_TYPE(p.param1));
|
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(p.param1));
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -2229,7 +2229,7 @@ void main() {
|
||||||
const uint iybs = i00 - i00%QUANT_K; // dst block start index
|
const uint iybs = i00 - i00%QUANT_K; // dst block start index
|
||||||
const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
|
const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
|
||||||
|
|
||||||
vec2 v = dequantize(ib, iqs);
|
vec2 v = dequantize(ib, iqs, 0);
|
||||||
|
|
||||||
data_d[d_offset + iybs + iqs ] = D_TYPE(v.x);
|
data_d[d_offset + iybs + iqs ] = D_TYPE(v.x);
|
||||||
data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y);
|
data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y);
|
||||||
|
@ -2328,7 +2328,11 @@ void main() {
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint i = row*p.ncols + col;
|
const uint i = row*p.ncols + col;
|
||||||
data_d[i] = D_TYPE(data_a[i] - float(uint(col > p.n_past + row % p.rows_per_channel) * 0xFFFFFFFF));
|
if (col > p.n_past + row % p.rows_per_channel) {
|
||||||
|
data_d[i] = D_TYPE(uintBitsToFloat(0xFF800000));
|
||||||
|
} else {
|
||||||
|
data_d[i] = D_TYPE(data_a[i]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -2348,8 +2352,6 @@ void main() {
|
||||||
const uint row = gl_WorkGroupID.x;
|
const uint row = gl_WorkGroupID.x;
|
||||||
const uint tid = gl_LocalInvocationID.x;
|
const uint tid = gl_LocalInvocationID.x;
|
||||||
|
|
||||||
const float eps = 1e-5f;
|
|
||||||
|
|
||||||
sum[tid] = vec2(0.0f, 0.0f);
|
sum[tid] = vec2(0.0f, 0.0f);
|
||||||
|
|
||||||
[[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
|
[[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
|
||||||
|
@ -2369,7 +2371,7 @@ void main() {
|
||||||
|
|
||||||
const float mean = sum[0].x / p.KX;
|
const float mean = sum[0].x / p.KX;
|
||||||
const float var = sum[0].y / p.KX - mean * mean;
|
const float var = sum[0].y / p.KX - mean * mean;
|
||||||
const float inv_std = inversesqrt(var + 1e-5f);
|
const float inv_std = inversesqrt(var + p.param1);
|
||||||
|
|
||||||
[[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
|
[[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
|
||||||
data_d[row*p.KX + col] = D_TYPE((float(data_a[row*p.KX + col]) - mean) * inv_std);
|
data_d[row*p.KX + col] = D_TYPE((float(data_a[row*p.KX + col]) - mean) * inv_std);
|
||||||
|
@ -2840,13 +2842,13 @@ async def main():
|
||||||
stream.clear()
|
stream.clear()
|
||||||
stream.extend((mulmat_head, shader_float_type, mulmat_body1, mulmat_load_scalar, mulmat_body2))
|
stream.extend((mulmat_head, shader_float_type, mulmat_body1, mulmat_load_scalar, mulmat_body2))
|
||||||
tasks.append(string_to_spv("matmul_f32", "".join(stream), {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
|
tasks.append(string_to_spv("matmul_f32", "".join(stream), {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
|
||||||
tasks.append(string_to_spv("matmul_f32_aligned", "".join(stream), {"LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "A_TYPE": vec_type, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
|
tasks.append(string_to_spv("matmul_f32_aligned", "".join(stream), {"LOAD_VEC_A": 1, "LOAD_VEC_B": load_vec, "A_TYPE": "float", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
|
||||||
|
|
||||||
tasks.append(string_to_spv("matmul_f16", "".join(stream), {"A_TYPE": "float16_t", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
|
tasks.append(string_to_spv("matmul_f16", "".join(stream), {"A_TYPE": "float16_t", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
|
||||||
tasks.append(string_to_spv("matmul_f16_aligned", "".join(stream), {"LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type_f16, "D_TYPE": "float"}, fp16))
|
tasks.append(string_to_spv("matmul_f16_aligned", "".join(stream), {"LOAD_VEC_A": 1, "LOAD_VEC_B": load_vec, "A_TYPE": "float16_t", "B_TYPE": vec_type_f16, "D_TYPE": "float"}, fp16))
|
||||||
|
|
||||||
tasks.append(string_to_spv("matmul_f16_f32", "".join(stream), {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
|
tasks.append(string_to_spv("matmul_f16_f32", "".join(stream), {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
|
||||||
tasks.append(string_to_spv("matmul_f16_f32_aligned", "".join(stream), {"LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
|
tasks.append(string_to_spv("matmul_f16_f32_aligned", "".join(stream), {"LOAD_VEC_A": 1, "LOAD_VEC_B": load_vec, "A_TYPE": "float16_t", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
|
||||||
|
|
||||||
stream.clear()
|
stream.clear()
|
||||||
stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q4_0_defines, mulmat_body1, mulmat_load_q4_0, mulmat_body2))
|
stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q4_0_defines, mulmat_body1, mulmat_load_q4_0, mulmat_body2))
|
||||||
|
@ -2992,7 +2994,9 @@ async def main():
|
||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
tasks.append(string_to_spv(f"mul_mat_vec_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION}))
|
tasks.append(string_to_spv(f"mul_mat_vec_{type_names[i]}_f32_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION}))
|
||||||
|
tasks.append(string_to_spv(f"mul_mat_vec_{type_names[i]}_f16_f32", "".join(stream), {"B_TYPE": "float16_t", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION}))
|
||||||
|
|
||||||
tasks.append(string_to_spv(f"mul_mat_vec_id_{type_names[i]}_f32", "".join(stream), {"MUL_MAT_ID": "1", "B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION}))
|
tasks.append(string_to_spv(f"mul_mat_vec_id_{type_names[i]}_f32", "".join(stream), {"MUL_MAT_ID": "1", "B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION}))
|
||||||
|
|
||||||
# Dequant shaders
|
# Dequant shaders
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue