Fix Vulkan GGML_OP_GET_ROWS implementation
This commit is contained in:
parent
1fceeb9046
commit
d00b11b0b5
3 changed files with 4732 additions and 2869 deletions
File diff suppressed because it is too large
Load diff
132
ggml-vulkan.cpp
132
ggml-vulkan.cpp
|
@ -1213,19 +1213,21 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
|
||||
|
||||
// get_rows
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
|
||||
|
||||
|
@ -3238,7 +3240,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|||
}
|
||||
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
|
||||
#endif
|
||||
GGML_ASSERT(!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type))); // NOLINT
|
||||
GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
|
||||
GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
|
||||
GGML_ASSERT(dst->extra != nullptr);
|
||||
const uint64_t ne00 = src0->ne[0];
|
||||
|
@ -3306,7 +3308,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|||
}
|
||||
}
|
||||
|
||||
uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
|
||||
uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
|
||||
uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
|
||||
uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
|
||||
uint64_t d_sz = ggml_type_size(dst->type) * ne0;
|
||||
|
@ -3338,14 +3340,18 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|||
GGML_ASSERT(d_Z != nullptr);
|
||||
}
|
||||
|
||||
if (op == GGML_OP_CPY) {
|
||||
if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS) {
|
||||
x_sz = ggml_nbytes(src0);
|
||||
y_sz = use_src1 ? ggml_nbytes(src1) : 0;
|
||||
d_sz = ggml_nbytes(dst);
|
||||
|
||||
if (extra_src0->offset + x_sz >= d_X->size) {
|
||||
if (x_buf_offset + x_sz >= d_X->size) {
|
||||
x_sz = VK_WHOLE_SIZE;
|
||||
}
|
||||
if (extra->offset + d_sz >= d_D->size) {
|
||||
if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
|
||||
y_sz = VK_WHOLE_SIZE;
|
||||
}
|
||||
if (d_buf_offset + d_sz >= d_D->size) {
|
||||
d_sz = VK_WHOLE_SIZE;
|
||||
}
|
||||
}
|
||||
|
@ -3353,7 +3359,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|||
std::array<uint32_t, 3> elements;
|
||||
|
||||
// Single call if dimension 2 is contiguous
|
||||
if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
|
||||
if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);
|
||||
|
||||
switch (dst->op) {
|
||||
|
@ -3366,16 +3372,19 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|||
case GGML_OP_ROPE:
|
||||
elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
|
||||
break;
|
||||
case GGML_OP_GET_ROWS:
|
||||
elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
|
||||
break;
|
||||
default:
|
||||
elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
|
||||
break;
|
||||
}
|
||||
|
||||
if (op != GGML_OP_CPY) {
|
||||
if (op != GGML_OP_CPY && op != GGML_OP_GET_ROWS) {
|
||||
if (x_sz != VK_WHOLE_SIZE) {
|
||||
x_sz *= ne02 * ne03;
|
||||
}
|
||||
if (y_sz != VK_WHOLE_SIZE) {
|
||||
if (use_src1 && y_sz != VK_WHOLE_SIZE) {
|
||||
y_sz *= ne12 * ne13;
|
||||
}
|
||||
if (d_sz != VK_WHOLE_SIZE) {
|
||||
|
@ -3430,6 +3439,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|||
case GGML_OP_ROPE:
|
||||
elements = { (uint32_t)ne01, (uint32_t)ne00, 1 };
|
||||
break;
|
||||
case GGML_OP_GET_ROWS:
|
||||
elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
|
||||
break;
|
||||
default:
|
||||
elements = { (uint32_t)ne0, 1, 1 };
|
||||
break;
|
||||
|
@ -3464,7 +3476,18 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|||
}
|
||||
|
||||
static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
0,
|
||||
0.0f, 0.0f,
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
|
@ -4739,7 +4762,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|||
}
|
||||
break;
|
||||
case GGML_OP_REPEAT:
|
||||
// case GGML_OP_GET_ROWS:
|
||||
case GGML_OP_GET_ROWS:
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_SCALE:
|
||||
|
@ -5561,8 +5584,25 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
|||
}
|
||||
break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
// case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
struct ggml_tensor * a;
|
||||
struct ggml_tensor * b;
|
||||
if (op->op == GGML_OP_MUL_MAT) {
|
||||
|
@ -5577,25 +5617,26 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
|||
}
|
||||
return true;
|
||||
} break;
|
||||
// case GGML_OP_GET_ROWS:
|
||||
// {
|
||||
// switch (op->src[0]->type) {
|
||||
// case GGML_TYPE_F16:
|
||||
// case GGML_TYPE_F32:
|
||||
// case GGML_TYPE_Q4_0:
|
||||
// case GGML_TYPE_Q4_1:
|
||||
// case GGML_TYPE_Q5_0:
|
||||
// case GGML_TYPE_Q5_1:
|
||||
// case GGML_TYPE_Q8_0:
|
||||
// return true;
|
||||
// default:
|
||||
// return false;
|
||||
// }
|
||||
// } break;
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_DUP:
|
||||
{
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
ggml_type src1_type = op->src[1]->type;
|
||||
ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
|
@ -5607,7 +5648,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
|||
}
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_DUP:
|
||||
// case GGML_OP_REPEAT:
|
||||
// {
|
||||
// ggml_type src0_type = op->src[0]->type;
|
||||
|
@ -5752,10 +5792,12 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, vo
|
|||
extern "C" GGML_CALL int ggml_backend_vk_reg_devices();
|
||||
|
||||
GGML_CALL int ggml_backend_vk_reg_devices() {
|
||||
for (auto idx : vk_instance.device_indices) {
|
||||
ggml_vk_instance_init();
|
||||
|
||||
for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
|
||||
char name[128];
|
||||
snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, idx);
|
||||
ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(idx), (void *) (intptr_t) idx); // NOLINT
|
||||
snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, i);
|
||||
ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i); // NOLINT
|
||||
}
|
||||
return vk_instance.device_indices.size();
|
||||
}
|
||||
|
@ -5839,6 +5881,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
|
|||
val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
|
||||
} else if (tensor->type == GGML_TYPE_F16) {
|
||||
val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
fprintf(stderr, "% 7.2f ", val);
|
||||
} else {
|
||||
|
@ -6196,6 +6240,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|||
tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]);
|
||||
} else if (tensor->op == GGML_OP_TRANSPOSE) {
|
||||
tensor_clone = ggml_transpose(ggml_ctx, src0_clone);
|
||||
} else if (tensor->op == GGML_OP_GET_ROWS) {
|
||||
tensor_clone = ggml_get_rows(ggml_ctx, src0_clone, src1_clone);
|
||||
} else {
|
||||
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
|
||||
GGML_ASSERT(false);
|
||||
|
|
|
@ -18,6 +18,12 @@ shader_int8_ext = """
|
|||
"""
|
||||
|
||||
# Type-specific defines
|
||||
shader_f32_defines = """
|
||||
#define QUANT_K 1
|
||||
#define QUANT_R 1
|
||||
|
||||
#define A_TYPE float
|
||||
"""
|
||||
shader_f16_defines = """
|
||||
#define QUANT_K 1
|
||||
#define QUANT_R 1
|
||||
|
@ -157,8 +163,8 @@ struct block_q6_K
|
|||
"""
|
||||
|
||||
# Dequant functions
|
||||
shader_f16_dequant_func = """
|
||||
#define DEQUANT_FUNC vec2 v = vec2(data_a[ib + 0], data_a[ib + 1]);
|
||||
shader_float_dequant_func = """
|
||||
#define DEQUANT_FUNC vec2 v = vec2(ib, ib); // data_a[ib], data_a[ib + 1]);
|
||||
"""
|
||||
|
||||
shader_q4_0_dequant_func = """
|
||||
|
@ -1738,8 +1744,9 @@ layout (push_constant) uniform parameter
|
|||
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
|
||||
uint d_offset;
|
||||
float param1; float param2;
|
||||
} p;
|
||||
} p;"""
|
||||
|
||||
generic_unary_op_funcs = """
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
|
@ -1763,14 +1770,17 @@ uint dst_idx(uint idx) {
|
|||
const uint i11 = (idx - i13_offset - i12_offset) / p.ne10;
|
||||
const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
|
||||
return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
|
||||
}
|
||||
}"""
|
||||
|
||||
generic_unary_op_main = """
|
||||
void main() {
|
||||
if (gl_GlobalInvocationID.x >= p.ne) {
|
||||
return;
|
||||
}
|
||||
"""
|
||||
|
||||
generic_unary_op_combined = f"{generic_unary_op_head}\n{generic_unary_op_funcs}\n{generic_unary_op_main}"
|
||||
|
||||
generic_binary_op_head = """#version 450
|
||||
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
@ -1782,13 +1792,14 @@ layout (push_constant) uniform parameter
|
|||
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
|
||||
uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
|
||||
uint d_offset;
|
||||
uint param1; uint param2;
|
||||
} p;
|
||||
float param1; float param2;
|
||||
} p;"""
|
||||
|
||||
generic_binary_op_funcs = """
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer B {A_TYPE data_b[];};
|
||||
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
uint src0_idx(uint idx) {
|
||||
|
@ -1820,14 +1831,17 @@ uint dst_idx(uint idx) {
|
|||
const uint i21 = (idx - i23_offset - i22_offset) / p.ne20;
|
||||
const uint i20 = idx - i23_offset - i22_offset - i21*p.ne20;
|
||||
return i23*p.nb23 + i22*p.nb22 + i21*p.nb21 + i20*p.nb20;
|
||||
}
|
||||
}"""
|
||||
|
||||
generic_binary_op_main = """
|
||||
void main() {
|
||||
if (gl_GlobalInvocationID.x >= p.ne) {
|
||||
return;
|
||||
}
|
||||
"""
|
||||
|
||||
generic_binary_op_combined = f"{generic_binary_op_head}\n{generic_binary_op_funcs}\n{generic_binary_op_main}"
|
||||
|
||||
# MUL F32
|
||||
mul_body = """
|
||||
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
|
||||
|
@ -1872,39 +1886,55 @@ cpy_f16_f16_end = """
|
|||
"""
|
||||
|
||||
# GET_ROWS
|
||||
get_rows_body = """
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer Y {int data_b[];};
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
|
||||
|
||||
get_rows_float_body = """
|
||||
void main() {
|
||||
const uint col = int(gl_GlobalInvocationID.x) * 2;
|
||||
const uint row = int(gl_GlobalInvocationID.y);
|
||||
const uint i00 = gl_GlobalInvocationID.x;
|
||||
const uint i10 = gl_GlobalInvocationID.y;
|
||||
const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
|
||||
const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
|
||||
|
||||
if (col >= p.KY) {
|
||||
if (i00 >= p.ne00) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint r = uint(data_b[row]);
|
||||
const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
|
||||
|
||||
// copy data_a[r*p.KY + col] to dst[row*p.KX + col]
|
||||
const uint xi = r*p.KY + col;
|
||||
const uint di = row*p.KY + col;
|
||||
const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
|
||||
const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
|
||||
|
||||
const uint ib = xi/QUANT_K; // block index
|
||||
const uint iqs = (xi%QUANT_K)/QUANT_R; // quant index
|
||||
const uint iybs = di - di%QUANT_K; // y block start index
|
||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||
data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);
|
||||
#else
|
||||
data_d[d_offset + i00] = data_a[a_offset + i00];
|
||||
#endif
|
||||
}
|
||||
"""
|
||||
|
||||
get_rows_body = """
|
||||
void main() {
|
||||
const uint i00 = (gl_GlobalInvocationID.x)*2;
|
||||
const uint i10 = gl_GlobalInvocationID.y;
|
||||
const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
|
||||
const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
|
||||
|
||||
if (i00 >= p.ne00) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
|
||||
|
||||
const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
|
||||
const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
|
||||
|
||||
const uint ib = a_offset + i00/QUANT_K; // block index
|
||||
const uint iqs = (i00%QUANT_K)/QUANT_R; // quant index
|
||||
const uint iybs = i00 - i00%QUANT_K; // dst block start index
|
||||
const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
|
||||
|
||||
DEQUANT_FUNC
|
||||
|
||||
dst[iybs + iqs + 0] = D_TYPE(v.x);
|
||||
dst[iybs + iqs + y_offset] = D_TYPE(v.y);
|
||||
data_d[d_offset + iybs + iqs ] = D_TYPE(v.x);
|
||||
data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y);
|
||||
}
|
||||
"""
|
||||
|
||||
|
@ -2578,7 +2608,7 @@ async def main():
|
|||
stream.extend((mul_mat_vec_head, shader_int8_ext, shader_f32))
|
||||
|
||||
if i == GGML_TYPE_F16:
|
||||
stream.extend((shader_f16_defines, shader_f16_dequant_func, mul_mat_vec_body))
|
||||
stream.extend((shader_f16_defines, shader_float_dequant_func, mul_mat_vec_body))
|
||||
elif i == GGML_TYPE_Q4_0:
|
||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, mul_mat_vec_body))
|
||||
elif i == GGML_TYPE_Q4_1:
|
||||
|
@ -2640,25 +2670,32 @@ async def main():
|
|||
# get_rows
|
||||
for i in range(0, VK_NUM_TYPES):
|
||||
stream.clear()
|
||||
stream.extend((generic_head, shader_int8_ext, shader_f32))
|
||||
stream.extend((generic_binary_op_head, shader_int8_ext, shader_f32))
|
||||
optimization_workaround = False
|
||||
|
||||
if i == GGML_TYPE_F16:
|
||||
stream.extend((shader_f16_defines, shader_f16_dequant_func, get_rows_body))
|
||||
if i == GGML_TYPE_F32:
|
||||
stream.extend((shader_f32_defines, generic_binary_op_funcs, get_rows_float_body))
|
||||
elif i == GGML_TYPE_F16:
|
||||
stream.extend((shader_f16_defines, generic_binary_op_funcs, get_rows_float_body))
|
||||
optimization_workaround = True
|
||||
elif i == GGML_TYPE_Q4_0:
|
||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, get_rows_body))
|
||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, generic_binary_op_funcs, get_rows_body))
|
||||
elif i == GGML_TYPE_Q4_1:
|
||||
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, get_rows_body))
|
||||
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, generic_binary_op_funcs, get_rows_body))
|
||||
elif i == GGML_TYPE_Q5_0:
|
||||
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, get_rows_body))
|
||||
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, generic_binary_op_funcs, get_rows_body))
|
||||
elif i == GGML_TYPE_Q5_1:
|
||||
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, get_rows_body))
|
||||
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, generic_binary_op_funcs, get_rows_body))
|
||||
elif i == GGML_TYPE_Q8_0:
|
||||
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, get_rows_body))
|
||||
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, generic_binary_op_funcs, get_rows_body))
|
||||
else:
|
||||
continue
|
||||
|
||||
tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float16_t"}))
|
||||
tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float"}))
|
||||
if optimization_workaround:
|
||||
tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
|
||||
else:
|
||||
tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float16_t"}))
|
||||
tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
@ -2667,20 +2704,20 @@ async def main():
|
|||
tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("cpy_f32_f32", f"{generic_unary_op_head}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("cpy_f32_f16", f"{generic_unary_op_head}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
|
||||
tasks.append(string_to_spv("cpy_f16_f16", f"{generic_unary_op_head}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||
tasks.append(string_to_spv("cpy_f32_f32", f"{generic_unary_op_combined}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("cpy_f32_f16", f"{generic_unary_op_combined}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
|
||||
tasks.append(string_to_spv("cpy_f16_f16", f"{generic_unary_op_combined}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||
|
||||
tasks.append(string_to_spv("add_f32", f"{generic_binary_op_head}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("add_f32", f"{generic_binary_op_combined}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {}))
|
||||
tasks.append(string_to_spv("mul_f32", f"{generic_binary_op_head}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("mul_f32", f"{generic_binary_op_combined}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("scale_f32", f"{generic_unary_op_head}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("scale_f32", f"{generic_unary_op_combined}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("sqr_f32", f"{generic_unary_op_head}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("sqr_f32", f"{generic_unary_op_combined}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("clamp_f32", f"{generic_unary_op_head}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("clamp_f32", f"{generic_unary_op_combined}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue