From 100ccd5e7fb5bafa92d57ea87108461f91bcfcc6 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 13 Jul 2024 00:06:58 +0800 Subject: [PATCH] add unary op template and more ops --- ggml/src/ggml-qnn.cpp | 37 ++--- ggml/src/ggml-qnn/backend-ops.cpp | 243 ++++++++++++++++++++++++------ ggml/src/ggml-qnn/backend-ops.hpp | 11 +- ggml/src/ggml-qnn/backend.hpp | 8 +- 4 files changed, 225 insertions(+), 74 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 3584c4112..de1fefe49 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -1,6 +1,5 @@ #include "ggml-qnn.h" -#include #include #include #include @@ -15,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -142,7 +142,8 @@ struct ggml_backend_qnn_buffer_type_context { // ================================================================================================= static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct ggml_tensor *tensor, bool b_dump_tensor_info) { - if (ggml_is_empty(tensor) || !qnn::ggml_qnn_op_array()[tensor->op]) { + if (ggml_is_empty(tensor) || + (!qnn::ggml_qnn_unary_op_array()[tensor->op] && !qnn::ggml_qnn_binary_op_array()[tensor->op])) { return false; } @@ -161,19 +162,6 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct g return false; } - // TODO: support other GGML OPs using QNN API - // a GENERAL approach could fix this problem in a standalone PR of refine ggml backend - // subsystem for hybrid inference between CPU&GPU / CPU&NPU easily(less the 100 LoC and no - // side-effect to the existing codes) for ANY ggml backends which the backend's - // ggml_backend_xxx_buffer_is_host return true. this approach could be found at: - // https://github.com/ggerganov/llama.cpp/pull/7641 - bool supported_op = false; - supported_op = (tensor->op == GGML_OP_ADD); - supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT)); - if (!supported_op) { - return false; - } - // TODO: support other quantized data type if (ggml_is_quantized(src0->type)) { if (src0->type != GGML_TYPE_Q8_0 && src0->type != GGML_TYPE_Q4_0) { @@ -192,14 +180,18 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct g } bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { - auto func = qnn::ggml_qnn_op_array()[tensor->op]; - if (!func) { - QNN_LOG_WARN("unsupported op %d", tensor->op); - return false; + auto unary_op = qnn::ggml_qnn_unary_op_array()[tensor->op]; + if (unary_op) { + return unary_op(ctx, tensor->src[0], tensor); } - func(ctx, tensor->src[0], tensor->src[1], tensor); - return true; + auto binary_op = qnn::ggml_qnn_binary_op_array()[tensor->op]; + if (binary_op) { + return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); + } + + QNN_LOG_WARN("unsupported op %d", tensor->op); + return false; } static const char *ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { @@ -232,7 +224,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t QNN_LOG_WARN("Create ggml_qnn_tensor failed"); return; } - + ctx->tensors.push_back(std::move(qnn_tensor)); } @@ -343,6 +335,7 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { auto instance = g_qnn_mgr[ctx->device].instance; if (instance) { + ctx->qnn_unary_graph_cache.clear(); for (const auto &graph_item : ctx->qnn_binary_graph_cache) { QNN_LOG_INFO("graph type:%s", graph_item.first.c_str()); } diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 30f2e402c..a516d8b06 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -12,6 +12,23 @@ namespace { +bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { + if (!ctx || !src || !dst) { + QNN_LOG_WARN("invalid params\n"); + return false; + } + + auto instance = ctx->instance; + auto *tensor0 = qnn::ggml_qnn_tensor::from_ggml_tensor(src); + auto *tensor1 = qnn::ggml_qnn_tensor::from_ggml_tensor(dst); + if (!instance || !tensor0 || !tensor1) { + QNN_LOG_WARN("invalid tensors\n"); + return false; + } + + return true; +} + bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { if (!ctx || !src0 || !src1 || !dst) { @@ -33,15 +50,13 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, } // namespace -#define CHECK_PARAMS(ctx, src0, src1, dst) \ - do { \ - if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ - return; \ - } \ - } while (0) +#define CHECK_PARAMS(ctx, ...) \ + if (!qnn_is_valid_params((ctx), __VA_ARGS__)) { \ + return false; \ + } #else -#define CHECK_PARAMS(ctx, src0, src1, dst) +#define CHECK_PARAMS(ctx, ...) #endif namespace { @@ -125,15 +140,33 @@ bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, return true; } +qnn::ggml_qnn_unary_graph_cache_t &get_qnn_graph_cache(ggml_backend_qnn_context *ctx, + const std::array &inputs, + const std::array &outputs) { + GGML_UNUSED(inputs); + GGML_UNUSED(outputs); + return ctx->qnn_unary_graph_cache; +} + +qnn::ggml_qnn_binary_graph_cache_t &get_qnn_graph_cache(ggml_backend_qnn_context *ctx, + const std::array &inputs, + const std::array &outputs) { + GGML_UNUSED(inputs); + GGML_UNUSED(outputs); + return ctx->qnn_binary_graph_cache; +} + template -qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, ggml_op op, - const std::string &qnn_op, - const std::array &inputs, - const std::array &outputs) { +qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( + ggml_backend_qnn_context *ctx, ggml_op op, const std::string &qnn_op, + const std::array &inputs, const std::array &outputs) { + using graph_t = qnn::ggml_qnn_graph<_InputSize, _OutputSize>; + + auto &graph_cache = get_qnn_graph_cache(ctx, inputs, outputs); const std::string graph_key(ggml_op_name(op)); - auto it = ctx->qnn_binary_graph_cache.find(graph_key); - qnn::ggml_qnn_graph_binary *graph_ptr = nullptr; - if (it != ctx->qnn_binary_graph_cache.end()) { + auto it = graph_cache.find(graph_key); + graph_t *graph_ptr = nullptr; + if (it != graph_cache.end()) { graph_ptr = it->second.get(); } else { std::string graph_name = graph_key + "_" + std::to_string(ctx->threads); @@ -141,49 +174,49 @@ qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *c graph_name += "_"; graph_name += input->name; } - auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), - ctx->instance->get_qnn_context_handle(), - ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); + auto graph = + std::make_unique(graph_name, (QNNBackend)(ctx->device), ctx->instance->get_qnn_context_handle(), + ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); if (!graph->is_valid()) { return nullptr; } - if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), qnn_op.c_str(), inputs, outputs)) { + if (!qnn_bind_tensors_to_graph<_InputSize, _OutputSize>(graph.get(), qnn_op.c_str(), inputs, outputs)) { return nullptr; } graph_ptr = graph.get(); - ctx->qnn_binary_graph_cache[graph_key] = std::move(graph); + graph_cache[graph_key] = std::move(graph); } return graph_ptr; } constexpr const char *kGgmlOpToQnnOp[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + QNN_OP_ELEMENT_WISE_SUBTRACT, // GGML_OP_SUB + QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL + QNN_OP_ELEMENT_WISE_DIVIDE, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM QNN_OP_MAT_MUL, // GGML_OP_MUL_MAT nullptr, // GGML_OP_MUL_MAT_ID @@ -249,7 +282,7 @@ static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == GGML_OP_COUN "GGML_OP_COUNT does not match the size of the ops table"); template -void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, +bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); @@ -270,20 +303,136 @@ void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, print_ggml_tensor(src1); print_ggml_tensor(dst); } + + return succeed; +} + +template +bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { + static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); + + CHECK_PARAMS(ctx, src, dst); + + qnn::qnn_perf perf(ggml_op_name(_GgmlOp)); + perf.start(); + + bool succeed = false; + auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src }, { dst }); + if (graph_ptr) { + succeed = execute_graph<1, 1>(graph_ptr, { src }, { dst }); + } + + if (!succeed) { + print_ggml_tensor(src); + print_ggml_tensor(dst); + } + + return succeed; } } // namespace -qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() { - static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[] = { +qnn::ggml_qnn_unary_op_array_t qnn::ggml_qnn_unary_op_array() { + static constexpr const qnn::ggml_qnn_unary_op_t kQnnOpsTable[] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + nullptr, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + nullptr, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + qnn_unary_op_impl, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + nullptr, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + }; + + static_assert(sizeof(kQnnOpsTable) / sizeof(kQnnOpsTable[0]) == GGML_OP_COUNT, + "GGML_OP_COUNT does not match the size of the ops table"); + return kQnnOpsTable; +} + +qnn::ggml_qnn_binary_op_array_t qnn::ggml_qnn_binary_op_array() { + static constexpr const qnn::ggml_qnn_binary_op_t kQnnOpsTable[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP qnn_binary_op_impl, // GGML_OP_ADD nullptr, // GGML_OP_ADD1 nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB + qnn_binary_op_impl, // GGML_OP_SUB qnn_binary_op_impl, // GGML_OP_MUL - nullptr, // GGML_OP_DIV + qnn_binary_op_impl, // GGML_OP_DIV nullptr, // GGML_OP_SQR nullptr, // GGML_OP_SQRT nullptr, // GGML_OP_LOG diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index 01c23ecff..8d94fc6c2 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -6,11 +6,14 @@ namespace qnn { -typedef void (*ggml_qnn_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst); +typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst); +typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst); -typedef const ggml_qnn_op_t (&ggml_qnn_op_array_t)[GGML_OP_COUNT]; +typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT]; +typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT]; -ggml_qnn_op_array_t ggml_qnn_op_array(); +ggml_qnn_unary_op_array_t ggml_qnn_unary_op_array(); +ggml_qnn_binary_op_array_t ggml_qnn_binary_op_array(); } // namespace qnn diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 48b243577..0ec927779 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -11,6 +11,11 @@ #include "graph.hpp" #include "qnn.hpp" +namespace qnn { +typedef std::unordered_map> ggml_qnn_unary_graph_cache_t; +typedef std::unordered_map> ggml_qnn_binary_graph_cache_t; +} // namespace qnn + struct ggml_backend_qnn_context { int device; int threads; @@ -21,5 +26,6 @@ struct ggml_backend_qnn_context { QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; qnn::qcom_socinfo socinfo; - std::unordered_map> qnn_binary_graph_cache; + qnn::ggml_qnn_unary_graph_cache_t qnn_unary_graph_cache; + qnn::ggml_qnn_binary_graph_cache_t qnn_binary_graph_cache; };