ggml: Add POOL2D OP for GPU acceleration to the Vulkan backend in the MobileVLM model. (#9763)
* ggml: Add POOL2D OP for GPU ACC to the Vulkan. - The MobileVLM model now supports inference acceleration through GPU by utilizing the Vulkan backend. - A GGML_OP_POOL_2D shader has been added. (Pooling) - The encoding performance of the CLIP model improved from 2.8s on the CPU to 0.7s on the GPU. Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com> * [fix] Correct the incorrect order of the parameters. fix casting to int. Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com> --------- Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com>
This commit is contained in:
parent
8d8ff71536
commit
8f275a7c45
3 changed files with 150 additions and 0 deletions
|
@ -213,6 +213,7 @@ struct vk_device_struct {
|
|||
vk_pipeline pipeline_sum_rows_f32;
|
||||
vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
|
||||
vk_pipeline pipeline_timestep_embedding_f32;
|
||||
vk_pipeline pipeline_pool2d_f32;
|
||||
|
||||
std::unordered_map<std::string, vk_pipeline_ref> pipelines;
|
||||
std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
|
||||
|
@ -403,6 +404,17 @@ struct vk_op_timestep_embedding_push_constants {
|
|||
uint32_t max_period;
|
||||
};
|
||||
|
||||
struct vk_op_pool2d_push_constants {
|
||||
uint32_t IW; uint32_t IH;
|
||||
uint32_t OW; uint32_t OH;
|
||||
uint32_t OC;
|
||||
uint32_t pelements;
|
||||
uint32_t op;
|
||||
int32_t k0; int32_t k1;
|
||||
int32_t s0; int32_t s1;
|
||||
int32_t p0; int32_t p1;
|
||||
};
|
||||
|
||||
// Allow pre-recording command buffers
|
||||
struct vk_staging_memcpy {
|
||||
vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
|
||||
|
@ -1803,6 +1815,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
for (auto &c : compiles) {
|
||||
c.wait();
|
||||
}
|
||||
|
@ -4234,6 +4248,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|||
return ctx->device->pipeline_timestep_embedding_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_POOL_2D:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_pool2d_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_leaky_relu_f32;
|
||||
|
@ -4464,6 +4483,14 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|||
uint32_t half_ceil = (dim + 1) / 2;
|
||||
elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
|
||||
} break;
|
||||
case GGML_OP_POOL_2D:
|
||||
{
|
||||
const uint32_t N = dst->ne[3];
|
||||
const uint32_t OC = dst->ne[2];
|
||||
const uint32_t OH = dst->ne[1];
|
||||
const uint32_t OW = dst->ne[0];
|
||||
elements = { N * OC * OH * OW, 1, 1};
|
||||
} break;
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_MUL:
|
||||
|
@ -4914,6 +4941,34 @@ static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context
|
|||
}, dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
||||
uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
|
||||
const int32_t k1 = dst->op_params[1];
|
||||
const int32_t k0 = dst->op_params[2];
|
||||
const int32_t s1 = dst->op_params[3];
|
||||
const int32_t s0 = dst->op_params[4];
|
||||
const int32_t p1 = dst->op_params[5];
|
||||
const int32_t p0 = dst->op_params[6];
|
||||
|
||||
const uint32_t IH = src0->ne[1];
|
||||
const uint32_t IW = src0->ne[0];
|
||||
|
||||
const uint32_t N = dst->ne[3];
|
||||
|
||||
const uint32_t OC = dst->ne[2];
|
||||
const uint32_t OH = dst->ne[1];
|
||||
const uint32_t OW = dst->ne[0];
|
||||
|
||||
const uint32_t parallel_elements = N * OC * OH * OW;
|
||||
|
||||
ggml_vk_op_f32<vk_op_pool2d_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_POOL_2D, {
|
||||
IW, IH, OW, OH, OC,
|
||||
parallel_elements,
|
||||
op,
|
||||
k0, k1, s0, s1, p0, p1,
|
||||
}, dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
||||
const float * op_params = (const float *)dst->op_params;
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun);
|
||||
|
@ -5792,6 +5847,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
break;
|
||||
default:
|
||||
|
@ -5927,6 +5983,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
|
||||
|
||||
break;
|
||||
case GGML_OP_POOL_2D:
|
||||
ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
|
||||
|
||||
break;
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
ggml_vk_leaky_relu(ctx, compute_ctx, src0, node, dryrun);
|
||||
|
@ -6018,6 +6078,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
|
|||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_REPEAT:
|
||||
buf = tensor->buffer;
|
||||
|
@ -6821,6 +6882,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
return true;
|
||||
default:
|
||||
|
@ -7334,6 +7396,16 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
|
|||
const int32_t dim = tensor->op_params[0];
|
||||
const int32_t max_period = tensor->op_params[1];
|
||||
tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
|
||||
} else if (tensor->op == GGML_OP_POOL_2D) {
|
||||
enum ggml_op_pool op = static_cast<ggml_op_pool>(dst->op_params[0]);
|
||||
const int32_t k0 = tensor->op_params[1];
|
||||
const int32_t k1 = tensor->op_params[2];
|
||||
const int32_t s0 = tensor->op_params[3];
|
||||
const int32_t s1 = tensor->op_params[4];
|
||||
const int32_t p0 = tensor->op_params[5];
|
||||
const int32_t p1 = tensor->op_params[6];
|
||||
|
||||
tensor_clone = ggml_pool_2d(ggml_ctx, src0_clone, op, k0, k1, s0, s1, p0, p1);
|
||||
} else if (tensor->op == GGML_OP_LEAKY_RELU) {
|
||||
const float * op_params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue