add unary op template and more ops

2024-07-13 00:06:58 +08:00 · 2024-07-13 00:06:58 +08:00 · 100ccd5e7f
commit 100ccd5e7f
parent 7cbc4fbd8c
4 changed files with 225 additions and 74 deletions
--- a/ggml/src/ggml-qnn.cpp
+++ b/ggml/src/ggml-qnn.cpp
@ -1,6 +1,5 @@
 #include "ggml-qnn.h"
 #include <list>
 #include <stdatomic.h>
 #include <stdio.h>
 #include <stdlib.h>
@ -15,6 +14,7 @@
 #include <fstream>
 #include <functional>
 #include <iostream>
 #include <list>
 #include <memory>
 #include <mutex>
 #include <queue>
@ -142,7 +142,8 @@ struct ggml_backend_qnn_buffer_type_context {
 // =================================================================================================
 static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct ggml_tensor *tensor,
                                   bool b_dump_tensor_info) {
-    if (ggml_is_empty(tensor) || !qnn::ggml_qnn_op_array()[tensor->op]) {
+    if (ggml_is_empty(tensor) || 
        (!qnn::ggml_qnn_unary_op_array()[tensor->op] && !qnn::ggml_qnn_binary_op_array()[tensor->op])) {
        return false;
    }
@ -161,19 +162,6 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct g
        return false;
    }
    // TODO: support other GGML OPs using QNN API
    // A general approach could solve this in a standalone PR that refines the ggml backend
    // subsystem for hybrid inference between CPU&GPU / CPU&NPU easily (less than 100 LoC and no
    // side effects on the existing code) for ANY ggml backend whose
    // ggml_backend_xxx_buffer_is_host returns true. This approach can be found at:
    // https://github.com/ggerganov/llama.cpp/pull/7641
    bool supported_op = false;
    supported_op = (tensor->op == GGML_OP_ADD);
    supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT));
    if (!supported_op) {
        return false;
    }
    // TODO: support other quantized data type
    if (ggml_is_quantized(src0->type)) {
        if (src0->type != GGML_TYPE_Q8_0 && src0->type != GGML_TYPE_Q4_0) {
@ -192,14 +180,18 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct g
 }
 bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) {
-    auto func = qnn::ggml_qnn_op_array()[tensor->op];
+    auto unary_op = qnn::ggml_qnn_unary_op_array()[tensor->op];
-    if (!func) {
+    if (unary_op) {
-        QNN_LOG_WARN("unsupported op %d", tensor->op);
+        return unary_op(ctx, tensor->src[0], tensor);
        return false;
    }
-    func(ctx, tensor->src[0], tensor->src[1], tensor);
+    auto binary_op = qnn::ggml_qnn_binary_op_array()[tensor->op];
-    return true;
+    if (binary_op) {
        return binary_op(ctx, tensor->src[0], tensor->src[1], tensor);
    }
    QNN_LOG_WARN("unsupported op %d", tensor->op);
    return false;
 }
 static const char *ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) {
@ -232,7 +224,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t
        QNN_LOG_WARN("Create ggml_qnn_tensor failed");
        return;
    }
-    
+
    ctx->tensors.push_back(std::move(qnn_tensor));
 }
@ -343,6 +335,7 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) {
    auto instance = g_qnn_mgr[ctx->device].instance;
    if (instance) {
        ctx->qnn_unary_graph_cache.clear();
        for (const auto &graph_item : ctx->qnn_binary_graph_cache) {
            QNN_LOG_INFO("graph type:%s", graph_item.first.c_str());
        }
--- a/ggml/src/ggml-qnn/backend-ops.cpp
+++ b/ggml/src/ggml-qnn/backend-ops.cpp
@ -12,6 +12,23 @@
 namespace {
 // Validates the arguments of a single-source op.
 // Returns false (after logging a warning) when any of ctx/src/dst is null, when the context has
 // no QNN instance, or when either ggml tensor has no associated qnn::ggml_qnn_tensor.
 bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) {
    const bool has_all_args = ctx && src && dst;
    if (!has_all_args) {
        QNN_LOG_WARN("invalid params\n");
        return false;
    }
    // Both tensor lookups are always performed before the combined check.
    const bool has_instance = static_cast<bool>(ctx->instance);
    const auto *src_qnn = qnn::ggml_qnn_tensor::from_ggml_tensor(src);
    const auto *dst_qnn = qnn::ggml_qnn_tensor::from_ggml_tensor(dst);
    if (has_instance && src_qnn && dst_qnn) {
        return true;
    }
    QNN_LOG_WARN("invalid tensors\n");
    return false;
 }
 bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1,
                         ggml_tensor *dst) {
    if (!ctx || !src0 || !src1 || !dst) {
@ -33,15 +50,13 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0,
 } // namespace
-#define CHECK_PARAMS(ctx, src0, src1, dst)                        \
+#define CHECK_PARAMS(ctx, ...)                      \
-    do {                                                          \
+    if (!qnn_is_valid_params((ctx), __VA_ARGS__)) { \
-        if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \
+        return false;                               \
-            return;                                               \
+    }
        }                                                         \
    } while (0)
 #else
-#define CHECK_PARAMS(ctx, src0, src1, dst)
+#define CHECK_PARAMS(ctx, ...)
 #endif
 namespace {
@ -125,15 +140,33 @@ bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph,
    return true;
 }
 // Overload for one input / one output: returns the context's unary graph cache.
 // The array arguments are not read; they only select this overload by their sizes.
 qnn::ggml_qnn_unary_graph_cache_t &get_qnn_graph_cache(
    ggml_backend_qnn_context *ctx, const std::array<const ggml_tensor *, 1> & /*inputs*/,
    const std::array<ggml_tensor *, 1> & /*outputs*/) {
    return ctx->qnn_unary_graph_cache;
 }
 // Overload for two inputs / one output: returns the context's binary graph cache.
 // The array arguments are not read; they only select this overload by their sizes.
 qnn::ggml_qnn_binary_graph_cache_t &get_qnn_graph_cache(
    ggml_backend_qnn_context *ctx, const std::array<const ggml_tensor *, 2> & /*inputs*/,
    const std::array<ggml_tensor *, 1> & /*outputs*/) {
    return ctx->qnn_binary_graph_cache;
 }
 template <size_t _InputSize, size_t _OutputSize>
-qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, ggml_op op,
+qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache(
-                                                     const std::string &qnn_op,
+    ggml_backend_qnn_context *ctx, ggml_op op, const std::string &qnn_op,
-                                                     const std::array<const ggml_tensor *, _InputSize> &inputs,
+    const std::array<const ggml_tensor *, _InputSize> &inputs, const std::array<ggml_tensor *, _OutputSize> &outputs) {
-                                                     const std::array<ggml_tensor *, _OutputSize> &outputs) {
+    using graph_t = qnn::ggml_qnn_graph<_InputSize, _OutputSize>;
    auto &graph_cache = get_qnn_graph_cache(ctx, inputs, outputs);
    const std::string graph_key(ggml_op_name(op));
-    auto it = ctx->qnn_binary_graph_cache.find(graph_key);
+    auto it = graph_cache.find(graph_key);
-    qnn::ggml_qnn_graph_binary *graph_ptr = nullptr;
+    graph_t *graph_ptr = nullptr;
-    if (it != ctx->qnn_binary_graph_cache.end()) {
+    if (it != graph_cache.end()) {
        graph_ptr = it->second.get();
    } else {
        std::string graph_name = graph_key + "_" + std::to_string(ctx->threads);
@ -141,49 +174,49 @@ qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *c
            graph_name += "_";
            graph_name += input->name;
        }
-        auto graph = std::make_unique<qnn::ggml_qnn_graph_binary>(graph_name, (QNNBackend)(ctx->device),
+        auto graph =
-                                                                  ctx->instance->get_qnn_context_handle(),
+            std::make_unique<graph_t>(graph_name, (QNNBackend)(ctx->device), ctx->instance->get_qnn_context_handle(),
-                                                                  ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb);
+                                      ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb);
        if (!graph->is_valid()) {
            return nullptr;
        }
-        if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), qnn_op.c_str(), inputs, outputs)) {
+        if (!qnn_bind_tensors_to_graph<_InputSize, _OutputSize>(graph.get(), qnn_op.c_str(), inputs, outputs)) {
            return nullptr;
        }
        graph_ptr = graph.get();
-        ctx->qnn_binary_graph_cache[graph_key] = std::move(graph);
+        graph_cache[graph_key] = std::move(graph);
    }
    return graph_ptr;
 }
 constexpr const char *kGgmlOpToQnnOp[] = {
-    nullptr,                      // GGML_OP_NONE
+    nullptr,                         // GGML_OP_NONE
-    nullptr,                      // GGML_OP_DUP
+    nullptr,                         // GGML_OP_DUP
-    QNN_OP_ELEMENT_WISE_ADD,      // GGML_OP_ADD
+    QNN_OP_ELEMENT_WISE_ADD,         // GGML_OP_ADD
-    nullptr,                      // GGML_OP_ADD1
+    nullptr,                         // GGML_OP_ADD1
-    nullptr,                      // GGML_OP_ACC
+    nullptr,                         // GGML_OP_ACC
-    nullptr,                      // GGML_OP_SUB
+    QNN_OP_ELEMENT_WISE_SUBTRACT,    // GGML_OP_SUB
-    QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL
+    QNN_OP_ELEMENT_WISE_MULTIPLY,    // GGML_OP_MUL
-    nullptr,                      // GGML_OP_DIV
+    QNN_OP_ELEMENT_WISE_DIVIDE,      // GGML_OP_DIV
-    nullptr,                      // GGML_OP_SQR
+    nullptr,                         // GGML_OP_SQR
-    nullptr,                      // GGML_OP_SQRT
+    QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT
-    nullptr,                      // GGML_OP_LOG
+    nullptr,                         // GGML_OP_LOG
-    nullptr,                      // GGML_OP_SUM
+    nullptr,                         // GGML_OP_SUM
-    nullptr,                      // GGML_OP_SUM_ROWS
+    nullptr,                         // GGML_OP_SUM_ROWS
-    nullptr,                      // GGML_OP_MEAN
+    nullptr,                         // GGML_OP_MEAN
-    nullptr,                      // GGML_OP_ARGMAX
+    nullptr,                         // GGML_OP_ARGMAX
-    nullptr,                      // GGML_OP_REPEAT
+    nullptr,                         // GGML_OP_REPEAT
-    nullptr,                      // GGML_OP_REPEAT_BACK
+    nullptr,                         // GGML_OP_REPEAT_BACK
-    nullptr,                      // GGML_OP_CONCAT
+    nullptr,                         // GGML_OP_CONCAT
-    nullptr,                      // GGML_OP_SILU_BACK
+    nullptr,                         // GGML_OP_SILU_BACK
-    nullptr,                      // GGML_OP_NORM
+    nullptr,                         // GGML_OP_NORM
-    nullptr,                      // GGML_OP_RMS_NORM
+    nullptr,                         // GGML_OP_RMS_NORM
-    nullptr,                      // GGML_OP_RMS_NORM_BACK
+    nullptr,                         // GGML_OP_RMS_NORM_BACK
-    nullptr,                      // GGML_OP_GROUP_NORM
+    nullptr,                         // GGML_OP_GROUP_NORM
    QNN_OP_MAT_MUL, // GGML_OP_MUL_MAT
    nullptr,        // GGML_OP_MUL_MAT_ID
@ -249,7 +282,7 @@ static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == GGML_OP_COUN
              "GGML_OP_COUNT does not match the size of the ops table");
 template <ggml_op _GgmlOp>
-void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1,
                        ggml_tensor *dst) {
    static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP");
@ -270,20 +303,136 @@ void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0,
        print_ggml_tensor(src1);
        print_ggml_tensor(dst);
    }
    return succeed;
 }
 // Runs the single-input op _GgmlOp: fetches (or creates) the graph for it through
 // get_qnn_graph_from_cache and executes it with src as input and dst as output.
 // Returns true on success; on failure both tensors are printed and false is returned.
 // Fails to compile for ops that have no entry in kGgmlOpToQnnOp.
 template <ggml_op _GgmlOp>
 bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) {
    static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP");
    // NOTE(review): CHECK_PARAMS presumably returns false early on invalid arguments when enabled.
    CHECK_PARAMS(ctx, src, dst);
    qnn::qnn_perf perf(ggml_op_name(_GgmlOp));
    perf.start();
    auto *cached_graph = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src }, { dst });
    // Execution is only attempted when a graph was obtained.
    const bool ok = cached_graph && execute_graph<1, 1>(cached_graph, { src }, { dst });
    if (ok) {
        return true;
    }
    print_ggml_tensor(src);
    print_ggml_tensor(dst);
    return false;
 }
 } // namespace
-qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() {
+qnn::ggml_qnn_unary_op_array_t qnn::ggml_qnn_unary_op_array() {
-    static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[] = {
+    static constexpr const qnn::ggml_qnn_unary_op_t kQnnOpsTable[] = {
        nullptr,                         // GGML_OP_NONE
        nullptr,                         // GGML_OP_DUP
        nullptr,                         // GGML_OP_ADD
        nullptr,                         // GGML_OP_ADD1
        nullptr,                         // GGML_OP_ACC
        nullptr,                         // GGML_OP_SUB
        nullptr,                         // GGML_OP_MUL
        nullptr,                         // GGML_OP_DIV
        nullptr,                         // GGML_OP_SQR
        qnn_unary_op_impl<GGML_OP_SQRT>, // GGML_OP_SQRT
        nullptr,                         // GGML_OP_LOG
        nullptr,                         // GGML_OP_SUM
        nullptr,                         // GGML_OP_SUM_ROWS
        nullptr,                         // GGML_OP_MEAN
        nullptr,                         // GGML_OP_ARGMAX
        nullptr,                         // GGML_OP_REPEAT
        nullptr,                         // GGML_OP_REPEAT_BACK
        nullptr,                         // GGML_OP_CONCAT
        nullptr,                         // GGML_OP_SILU_BACK
        nullptr,                         // GGML_OP_NORM
        nullptr,                         // GGML_OP_RMS_NORM
        nullptr,                         // GGML_OP_RMS_NORM_BACK
        nullptr,                         // GGML_OP_GROUP_NORM
        nullptr, // GGML_OP_MUL_MAT
        nullptr, // GGML_OP_MUL_MAT_ID
        nullptr, // GGML_OP_OUT_PROD
        nullptr, // GGML_OP_SCALE
        nullptr, // GGML_OP_SET
        nullptr, // GGML_OP_CPY
        nullptr, // GGML_OP_CONT
        nullptr, // GGML_OP_RESHAPE
        nullptr, // GGML_OP_VIEW
        nullptr, // GGML_OP_PERMUTE
        nullptr, // GGML_OP_TRANSPOSE
        nullptr, // GGML_OP_GET_ROWS
        nullptr, // GGML_OP_GET_ROWS_BACK
        nullptr, // GGML_OP_DIAG
        nullptr, // GGML_OP_DIAG_MASK_INF
        nullptr, // GGML_OP_DIAG_MASK_ZERO
        nullptr, // GGML_OP_SOFT_MAX
        nullptr, // GGML_OP_SOFT_MAX_BACK
        nullptr, // GGML_OP_ROPE
        nullptr, // GGML_OP_ROPE_BACK
        nullptr, // GGML_OP_CLAMP
        nullptr, // GGML_OP_CONV_TRANSPOSE_1D
        nullptr, // GGML_OP_IM2COL
        nullptr, // GGML_OP_CONV_TRANSPOSE_2D
        nullptr, // GGML_OP_POOL_1D
        nullptr, // GGML_OP_POOL_2D
        nullptr, // GGML_OP_UPSCALE
        nullptr, // GGML_OP_PAD
        nullptr, // GGML_OP_ARANGE
        nullptr, // GGML_OP_TIMESTEP_EMBEDDING
        nullptr, // GGML_OP_ARGSORT
        nullptr, // GGML_OP_LEAKY_RELU
        nullptr, // GGML_OP_FLASH_ATTN_EXT
        nullptr, // GGML_OP_FLASH_ATTN_BACK
        nullptr, // GGML_OP_SSM_CONV
        nullptr, // GGML_OP_SSM_SCAN
        nullptr, // GGML_OP_WIN_PART
        nullptr, // GGML_OP_WIN_UNPART
        nullptr, // GGML_OP_GET_REL_POS
        nullptr, // GGML_OP_ADD_REL_POS
        nullptr, // GGML_OP_UNARY
        nullptr, // GGML_OP_MAP_UNARY
        nullptr, // GGML_OP_MAP_BINARY
        nullptr, // GGML_OP_MAP_CUSTOM1_F32
        nullptr, // GGML_OP_MAP_CUSTOM2_F32
        nullptr, // GGML_OP_MAP_CUSTOM3_F32
        nullptr, // GGML_OP_MAP_CUSTOM1
        nullptr, // GGML_OP_MAP_CUSTOM2
        nullptr, // GGML_OP_MAP_CUSTOM3
        nullptr, // GGML_OP_CROSS_ENTROPY_LOSS
        nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK
    };
    static_assert(sizeof(kQnnOpsTable) / sizeof(kQnnOpsTable[0]) == GGML_OP_COUNT,
                  "GGML_OP_COUNT does not match the size of the ops table");
    return kQnnOpsTable;
 }
 qnn::ggml_qnn_binary_op_array_t qnn::ggml_qnn_binary_op_array() {
    static constexpr const qnn::ggml_qnn_binary_op_t kQnnOpsTable[] = {
        nullptr,                         // GGML_OP_NONE
        nullptr,                         // GGML_OP_DUP
        qnn_binary_op_impl<GGML_OP_ADD>, // GGML_OP_ADD
        nullptr,                         // GGML_OP_ADD1
        nullptr,                         // GGML_OP_ACC
-        nullptr,                         // GGML_OP_SUB
+        qnn_binary_op_impl<GGML_OP_SUB>, // GGML_OP_SUB
        qnn_binary_op_impl<GGML_OP_MUL>, // GGML_OP_MUL
-        nullptr,                         // GGML_OP_DIV
+        qnn_binary_op_impl<GGML_OP_DIV>, // GGML_OP_DIV
        nullptr,                         // GGML_OP_SQR
        nullptr,                         // GGML_OP_SQRT
        nullptr,                         // GGML_OP_LOG
--- a/ggml/src/ggml-qnn/backend-ops.hpp
+++ b/ggml/src/ggml-qnn/backend-ops.hpp
@ -6,11 +6,14 @@
 namespace qnn {
-typedef void (*ggml_qnn_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst);
-                              ggml_tensor *dst);
+typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1,
                                     ggml_tensor *dst);
-typedef const ggml_qnn_op_t (&ggml_qnn_op_array_t)[GGML_OP_COUNT];
+typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT];
 typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT];
-ggml_qnn_op_array_t ggml_qnn_op_array();
+ggml_qnn_unary_op_array_t ggml_qnn_unary_op_array();
 ggml_qnn_binary_op_array_t ggml_qnn_binary_op_array();
 } // namespace qnn
--- a/ggml/src/ggml-qnn/backend.hpp
+++ b/ggml/src/ggml-qnn/backend.hpp
@ -11,6 +11,11 @@
 #include "graph.hpp"
 #include "qnn.hpp"
 namespace qnn {
 // Graph caches keyed by a string, one per graph arity; each cache owns its graphs.
 using ggml_qnn_unary_graph_cache_t = std::unordered_map<std::string, std::unique_ptr<qnn::ggml_qnn_graph_unary>>;
 using ggml_qnn_binary_graph_cache_t = std::unordered_map<std::string, std::unique_ptr<qnn::ggml_qnn_graph_binary>>;
 } // namespace qnn
 struct ggml_backend_qnn_context {
    int device;
    int threads;
@ -21,5 +26,6 @@ struct ggml_backend_qnn_context {
    QNN_INTERFACE_VER_TYPE raw_interface;
    QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface;
    qnn::qcom_socinfo socinfo;
-    std::unordered_map<std::string, std::unique_ptr<qnn::ggml_qnn_graph_binary>> qnn_binary_graph_cache;
+    qnn::ggml_qnn_unary_graph_cache_t qnn_unary_graph_cache;
    qnn::ggml_qnn_binary_graph_cache_t qnn_binary_graph_cache;
 };