From 100ccd5e7fb5bafa92d57ea87108461f91bcfcc6 Mon Sep 17 00:00:00 2001
From: hongruichen <chraac@gmail.com>
Date: Sat, 13 Jul 2024 00:06:58 +0800
Subject: [PATCH] add unary op template and more ops

---
 ggml/src/ggml-qnn.cpp             |  37 ++---
 ggml/src/ggml-qnn/backend-ops.cpp | 243 ++++++++++++++++++++++++------
 ggml/src/ggml-qnn/backend-ops.hpp |  11 +-
 ggml/src/ggml-qnn/backend.hpp     |   8 +-
 4 files changed, 225 insertions(+), 74 deletions(-)
diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp
index 3584c4112..de1fefe49 100644
--- a/ggml/src/ggml-qnn.cpp
+++ b/ggml/src/ggml-qnn.cpp
@@ -1,6 +1,5 @@
 #include "ggml-qnn.h"
 
-#include <list>
 #include <stdatomic.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -15,6 +14,7 @@
 #include <fstream>
 #include <functional>
 #include <iostream>
+#include <list>
 #include <memory>
 #include <mutex>
 #include <queue>
@@ -142,7 +142,8 @@ struct ggml_backend_qnn_buffer_type_context {
 // =================================================================================================
 static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct ggml_tensor *tensor,
                                    bool b_dump_tensor_info) {
-    if (ggml_is_empty(tensor) || !qnn::ggml_qnn_op_array()[tensor->op]) {
+    if (ggml_is_empty(tensor) || 
+        (!qnn::ggml_qnn_unary_op_array()[tensor->op] && !qnn::ggml_qnn_binary_op_array()[tensor->op])) {
         return false;
     }
 
@@ -161,19 +162,6 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct g
         return false;
     }
 
-    // TODO: support other GGML OPs using QNN API
-    // a GENERAL approach could fix this problem in a standalone PR of refine ggml backend
-    // subsystem for hybrid inference between CPU&GPU / CPU&NPU easily(less the 100 LoC and no
-    // side-effect to the existing codes) for ANY ggml backends which the backend's
-    // ggml_backend_xxx_buffer_is_host return true. this approach could be found at:
-    // https://github.com/ggerganov/llama.cpp/pull/7641
-    bool supported_op = false;
-    supported_op = (tensor->op == GGML_OP_ADD);
-    supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT));
-    if (!supported_op) {
-        return false;
-    }
-
     // TODO: support other quantized data type
     if (ggml_is_quantized(src0->type)) {
         if (src0->type != GGML_TYPE_Q8_0 && src0->type != GGML_TYPE_Q4_0) {
@@ -192,14 +180,18 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct g
 }
 
 bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) {
-    auto func = qnn::ggml_qnn_op_array()[tensor->op];
-    if (!func) {
-        QNN_LOG_WARN("unsupported op %d", tensor->op);
-        return false;
+    auto unary_op = qnn::ggml_qnn_unary_op_array()[tensor->op];
+    if (unary_op) {
+        return unary_op(ctx, tensor->src[0], tensor);
     }
 
-    func(ctx, tensor->src[0], tensor->src[1], tensor);
-    return true;
+    auto binary_op = qnn::ggml_qnn_binary_op_array()[tensor->op];
+    if (binary_op) {
+        return binary_op(ctx, tensor->src[0], tensor->src[1], tensor);
+    }
+
+    QNN_LOG_WARN("unsupported op %d", tensor->op);
+    return false;
 }
 
 static const char *ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) {
@@ -232,7 +224,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t
         QNN_LOG_WARN("Create ggml_qnn_tensor failed");
         return;
     }
-    
+
     ctx->tensors.push_back(std::move(qnn_tensor));
 }
 
@@ -343,6 +335,7 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) {
 
     auto instance = g_qnn_mgr[ctx->device].instance;
     if (instance) {
+        ctx->qnn_unary_graph_cache.clear();
         for (const auto &graph_item : ctx->qnn_binary_graph_cache) {
             QNN_LOG_INFO("graph type:%s", graph_item.first.c_str());
         }
diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp
index 30f2e402c..a516d8b06 100644
--- a/ggml/src/ggml-qnn/backend-ops.cpp
+++ b/ggml/src/ggml-qnn/backend-ops.cpp
@@ -12,6 +12,23 @@
 
 namespace {
 
+// Unary-op parameter guard: warns and returns false if ctx/src/dst, the QNN instance or a QNN tensor is null.
+bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) {
+    const bool has_all_pointers = ctx && src && dst;
+    if (!has_all_pointers) {
+        QNN_LOG_WARN("invalid params\n");
+        return false;
+    }
+
+    const auto *qnn_src = qnn::ggml_qnn_tensor::from_ggml_tensor(src);
+    const auto *qnn_dst = qnn::ggml_qnn_tensor::from_ggml_tensor(dst);
+    if (ctx->instance && qnn_src && qnn_dst) {
+        return true;
+    }
+    QNN_LOG_WARN("invalid tensors\n");
+    return false;
+}
+
 bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1,
                          ggml_tensor *dst) {
     if (!ctx || !src0 || !src1 || !dst) {
@@ -33,15 +50,13 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0,
 
 } // namespace
 
-#define CHECK_PARAMS(ctx, src0, src1, dst)                        \
-    do {                                                          \
-        if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \
-            return;                                               \
-        }                                                         \
-    } while (0)
+#define CHECK_PARAMS(ctx, ...) /* make the calling op return false when its parameters are invalid */ \
+    do { /* do/while(0): expands to exactly one statement, safe in unbraced if/else */               \
+        if (!qnn_is_valid_params((ctx), __VA_ARGS__)) { return false; }                               \
+    } while (0)
 
 #else
-#define CHECK_PARAMS(ctx, src0, src1, dst)
+#define CHECK_PARAMS(ctx, ...)
 #endif
 
 namespace {
@@ -125,15 +140,33 @@ bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph,
     return true;
 }
 
+// 1-in/1-out overload: the array arguments are unused and only select this overload by their sizes.
+qnn::ggml_qnn_unary_graph_cache_t &get_qnn_graph_cache(ggml_backend_qnn_context *ctx,
+    const std::array<const ggml_tensor *, 1> &inputs, const std::array<ggml_tensor *, 1> &outputs) {
+    GGML_UNUSED(outputs);
+    GGML_UNUSED(inputs);
+    return ctx->qnn_unary_graph_cache;
+}
+
+// 2-in/1-out overload: the array arguments are unused and only select this overload by their sizes.
+qnn::ggml_qnn_binary_graph_cache_t &get_qnn_graph_cache(ggml_backend_qnn_context *ctx,
+    const std::array<const ggml_tensor *, 2> &inputs, const std::array<ggml_tensor *, 1> &outputs) {
+    GGML_UNUSED(outputs);
+    GGML_UNUSED(inputs);
+    return ctx->qnn_binary_graph_cache;
+}
+
 template <size_t _InputSize, size_t _OutputSize>
-qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, ggml_op op,
-                                                     const std::string &qnn_op,
-                                                     const std::array<const ggml_tensor *, _InputSize> &inputs,
-                                                     const std::array<ggml_tensor *, _OutputSize> &outputs) {
+qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache(
+    ggml_backend_qnn_context *ctx, ggml_op op, const std::string &qnn_op,
+    const std::array<const ggml_tensor *, _InputSize> &inputs, const std::array<ggml_tensor *, _OutputSize> &outputs) {
+    using graph_t = qnn::ggml_qnn_graph<_InputSize, _OutputSize>;
+
+    auto &graph_cache = get_qnn_graph_cache(ctx, inputs, outputs);
     const std::string graph_key(ggml_op_name(op));
-    auto it = ctx->qnn_binary_graph_cache.find(graph_key);
-    qnn::ggml_qnn_graph_binary *graph_ptr = nullptr;
-    if (it != ctx->qnn_binary_graph_cache.end()) {
+    auto it = graph_cache.find(graph_key);
+    graph_t *graph_ptr = nullptr;
+    if (it != graph_cache.end()) {
         graph_ptr = it->second.get();
     } else {
         std::string graph_name = graph_key + "_" + std::to_string(ctx->threads);
@@ -141,49 +174,49 @@ qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *c
             graph_name += "_";
             graph_name += input->name;
         }
-        auto graph = std::make_unique<qnn::ggml_qnn_graph_binary>(graph_name, (QNNBackend)(ctx->device),
-                                                                  ctx->instance->get_qnn_context_handle(),
-                                                                  ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb);
+        auto graph =
+            std::make_unique<graph_t>(graph_name, (QNNBackend)(ctx->device), ctx->instance->get_qnn_context_handle(),
+                                      ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb);
 
         if (!graph->is_valid()) {
             return nullptr;
         }
 
-        if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), qnn_op.c_str(), inputs, outputs)) {
+        if (!qnn_bind_tensors_to_graph<_InputSize, _OutputSize>(graph.get(), qnn_op.c_str(), inputs, outputs)) {
             return nullptr;
         }
 
         graph_ptr = graph.get();
-        ctx->qnn_binary_graph_cache[graph_key] = std::move(graph);
+        graph_cache[graph_key] = std::move(graph);
     }
 
     return graph_ptr;
 }
 
 constexpr const char *kGgmlOpToQnnOp[] = {
-    nullptr,                      // GGML_OP_NONE
-    nullptr,                      // GGML_OP_DUP
-    QNN_OP_ELEMENT_WISE_ADD,      // GGML_OP_ADD
-    nullptr,                      // GGML_OP_ADD1
-    nullptr,                      // GGML_OP_ACC
-    nullptr,                      // GGML_OP_SUB
-    QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL
-    nullptr,                      // GGML_OP_DIV
-    nullptr,                      // GGML_OP_SQR
-    nullptr,                      // GGML_OP_SQRT
-    nullptr,                      // GGML_OP_LOG
-    nullptr,                      // GGML_OP_SUM
-    nullptr,                      // GGML_OP_SUM_ROWS
-    nullptr,                      // GGML_OP_MEAN
-    nullptr,                      // GGML_OP_ARGMAX
-    nullptr,                      // GGML_OP_REPEAT
-    nullptr,                      // GGML_OP_REPEAT_BACK
-    nullptr,                      // GGML_OP_CONCAT
-    nullptr,                      // GGML_OP_SILU_BACK
-    nullptr,                      // GGML_OP_NORM
-    nullptr,                      // GGML_OP_RMS_NORM
-    nullptr,                      // GGML_OP_RMS_NORM_BACK
-    nullptr,                      // GGML_OP_GROUP_NORM
+    nullptr,                         // GGML_OP_NONE
+    nullptr,                         // GGML_OP_DUP
+    QNN_OP_ELEMENT_WISE_ADD,         // GGML_OP_ADD
+    nullptr,                         // GGML_OP_ADD1
+    nullptr,                         // GGML_OP_ACC
+    QNN_OP_ELEMENT_WISE_SUBTRACT,    // GGML_OP_SUB
+    QNN_OP_ELEMENT_WISE_MULTIPLY,    // GGML_OP_MUL
+    QNN_OP_ELEMENT_WISE_DIVIDE,      // GGML_OP_DIV
+    nullptr,                         // GGML_OP_SQR
+    QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT
+    nullptr,                         // GGML_OP_LOG
+    nullptr,                         // GGML_OP_SUM
+    nullptr,                         // GGML_OP_SUM_ROWS
+    nullptr,                         // GGML_OP_MEAN
+    nullptr,                         // GGML_OP_ARGMAX
+    nullptr,                         // GGML_OP_REPEAT
+    nullptr,                         // GGML_OP_REPEAT_BACK
+    nullptr,                         // GGML_OP_CONCAT
+    nullptr,                         // GGML_OP_SILU_BACK
+    nullptr,                         // GGML_OP_NORM
+    nullptr,                         // GGML_OP_RMS_NORM
+    nullptr,                         // GGML_OP_RMS_NORM_BACK
+    nullptr,                         // GGML_OP_GROUP_NORM
 
     QNN_OP_MAT_MUL, // GGML_OP_MUL_MAT
     nullptr,        // GGML_OP_MUL_MAT_ID
@@ -249,7 +282,7 @@ static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == GGML_OP_COUN
               "GGML_OP_COUNT does not match the size of the ops table");
 
 template <ggml_op _GgmlOp>
-void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1,
                         ggml_tensor *dst) {
     static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP");
 
@@ -270,20 +303,136 @@ void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0,
         print_ggml_tensor(src1);
         print_ggml_tensor(dst);
     }
+
+    return succeed;
+}
+
+// Runs the QNN op mapped to _GgmlOp on a single input tensor. Returns false, after dumping src and
+// dst, when no graph could be obtained from the cache or when executing the graph failed.
+template <ggml_op _GgmlOp>
+bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) {
+    constexpr const char *kQnnOpName = kGgmlOpToQnnOp[_GgmlOp];
+    static_assert(kQnnOpName != nullptr, "GGML_OP does not have a corresponding QNN_OP");
+
+    CHECK_PARAMS(ctx, src, dst);
+
+    qnn::qnn_perf perf(ggml_op_name(_GgmlOp));
+    perf.start();
+
+    auto *graph = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, kQnnOpName, { src }, { dst });
+    const bool ok = graph != nullptr && execute_graph<1, 1>(graph, { src }, { dst });
+    if (ok) {
+        return true;
+    }
+
+    print_ggml_tensor(src);
+    print_ggml_tensor(dst);
+    return false;
+}
 
 } // namespace
 
-qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() {
-    static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[] = {
+qnn::ggml_qnn_unary_op_array_t qnn::ggml_qnn_unary_op_array() { // returns the unary dispatch table, indexed by ggml_op
+    static constexpr const qnn::ggml_qnn_unary_op_t kQnnOpsTable[] = {
+        nullptr,                         // GGML_OP_NONE
+        nullptr,                         // GGML_OP_DUP
+        nullptr,                         // GGML_OP_ADD
+        nullptr,                         // GGML_OP_ADD1
+        nullptr,                         // GGML_OP_ACC
+        nullptr,                         // GGML_OP_SUB
+        nullptr,                         // GGML_OP_MUL
+        nullptr,                         // GGML_OP_DIV
+        nullptr,                         // GGML_OP_SQR
+        qnn_unary_op_impl<GGML_OP_SQRT>, // GGML_OP_SQRT
+        nullptr,                         // GGML_OP_LOG
+        nullptr,                         // GGML_OP_SUM
+        nullptr,                         // GGML_OP_SUM_ROWS
+        nullptr,                         // GGML_OP_MEAN
+        nullptr,                         // GGML_OP_ARGMAX
+        nullptr,                         // GGML_OP_REPEAT
+        nullptr,                         // GGML_OP_REPEAT_BACK
+        nullptr,                         // GGML_OP_CONCAT
+        nullptr,                         // GGML_OP_SILU_BACK
+        nullptr,                         // GGML_OP_NORM
+        nullptr,                         // GGML_OP_RMS_NORM
+        nullptr,                         // GGML_OP_RMS_NORM_BACK
+        nullptr,                         // GGML_OP_GROUP_NORM
+
+        nullptr, // GGML_OP_MUL_MAT
+        nullptr, // GGML_OP_MUL_MAT_ID
+        nullptr, // GGML_OP_OUT_PROD
+
+        nullptr, // GGML_OP_SCALE
+        nullptr, // GGML_OP_SET
+        nullptr, // GGML_OP_CPY
+        nullptr, // GGML_OP_CONT
+        nullptr, // GGML_OP_RESHAPE
+        nullptr, // GGML_OP_VIEW
+        nullptr, // GGML_OP_PERMUTE
+        nullptr, // GGML_OP_TRANSPOSE
+        nullptr, // GGML_OP_GET_ROWS
+        nullptr, // GGML_OP_GET_ROWS_BACK
+        nullptr, // GGML_OP_DIAG
+        nullptr, // GGML_OP_DIAG_MASK_INF
+        nullptr, // GGML_OP_DIAG_MASK_ZERO
+        nullptr, // GGML_OP_SOFT_MAX
+        nullptr, // GGML_OP_SOFT_MAX_BACK
+        nullptr, // GGML_OP_ROPE
+        nullptr, // GGML_OP_ROPE_BACK
+        nullptr, // GGML_OP_CLAMP
+        nullptr, // GGML_OP_CONV_TRANSPOSE_1D
+        nullptr, // GGML_OP_IM2COL
+        nullptr, // GGML_OP_CONV_TRANSPOSE_2D
+        nullptr, // GGML_OP_POOL_1D
+        nullptr, // GGML_OP_POOL_2D
+        nullptr, // GGML_OP_UPSCALE
+        nullptr, // GGML_OP_PAD
+        nullptr, // GGML_OP_ARANGE
+        nullptr, // GGML_OP_TIMESTEP_EMBEDDING
+        nullptr, // GGML_OP_ARGSORT
+        nullptr, // GGML_OP_LEAKY_RELU
+
+        nullptr, // GGML_OP_FLASH_ATTN_EXT
+        nullptr, // GGML_OP_FLASH_ATTN_BACK
+        nullptr, // GGML_OP_SSM_CONV
+        nullptr, // GGML_OP_SSM_SCAN
+        nullptr, // GGML_OP_WIN_PART
+        nullptr, // GGML_OP_WIN_UNPART
+        nullptr, // GGML_OP_GET_REL_POS
+        nullptr, // GGML_OP_ADD_REL_POS
+
+        nullptr, // GGML_OP_UNARY
+
+        nullptr, // GGML_OP_MAP_UNARY
+        nullptr, // GGML_OP_MAP_BINARY
+
+        nullptr, // GGML_OP_MAP_CUSTOM1_F32
+        nullptr, // GGML_OP_MAP_CUSTOM2_F32
+        nullptr, // GGML_OP_MAP_CUSTOM3_F32
+
+        nullptr, // GGML_OP_MAP_CUSTOM1
+        nullptr, // GGML_OP_MAP_CUSTOM2
+        nullptr, // GGML_OP_MAP_CUSTOM3
+
+        nullptr, // GGML_OP_CROSS_ENTROPY_LOSS
+        nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK
+    };
+
+    static_assert(sizeof(kQnnOpsTable) / sizeof(kQnnOpsTable[0]) == GGML_OP_COUNT,
+                  "GGML_OP_COUNT does not match the size of the ops table");
+    return kQnnOpsTable; // reference to the function-local static table; a nullptr entry means no unary handler
+}
+
+qnn::ggml_qnn_binary_op_array_t qnn::ggml_qnn_binary_op_array() {
+    static constexpr const qnn::ggml_qnn_binary_op_t kQnnOpsTable[] = {
         nullptr,                         // GGML_OP_NONE
         nullptr,                         // GGML_OP_DUP
         qnn_binary_op_impl<GGML_OP_ADD>, // GGML_OP_ADD
         nullptr,                         // GGML_OP_ADD1
         nullptr,                         // GGML_OP_ACC
-        nullptr,                         // GGML_OP_SUB
+        qnn_binary_op_impl<GGML_OP_SUB>, // GGML_OP_SUB
         qnn_binary_op_impl<GGML_OP_MUL>, // GGML_OP_MUL
-        nullptr,                         // GGML_OP_DIV
+        qnn_binary_op_impl<GGML_OP_DIV>, // GGML_OP_DIV
         nullptr,                         // GGML_OP_SQR
         nullptr,                         // GGML_OP_SQRT
         nullptr,                         // GGML_OP_LOG
diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp
index 01c23ecff..8d94fc6c2 100644
--- a/ggml/src/ggml-qnn/backend-ops.hpp
+++ b/ggml/src/ggml-qnn/backend-ops.hpp
@@ -6,11 +6,14 @@
 
 namespace qnn {
 
-typedef void (*ggml_qnn_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst);
+typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst);
+typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                                     ggml_tensor *dst);
 
-typedef const ggml_qnn_op_t (&ggml_qnn_op_array_t)[GGML_OP_COUNT];
+typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT];
+typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT];
 
-ggml_qnn_op_array_t ggml_qnn_op_array();
+ggml_qnn_unary_op_array_t ggml_qnn_unary_op_array();
+ggml_qnn_binary_op_array_t ggml_qnn_binary_op_array();
 
 } // namespace qnn
diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp
index 48b243577..0ec927779 100644
--- a/ggml/src/ggml-qnn/backend.hpp
+++ b/ggml/src/ggml-qnn/backend.hpp
@@ -11,6 +11,11 @@
 #include "graph.hpp"
 #include "qnn.hpp"
 
+namespace qnn {
+typedef std::unordered_map<std::string, std::unique_ptr<qnn::ggml_qnn_graph_unary>> ggml_qnn_unary_graph_cache_t;
+typedef std::unordered_map<std::string, std::unique_ptr<qnn::ggml_qnn_graph_binary>> ggml_qnn_binary_graph_cache_t;
+} // namespace qnn
+
 struct ggml_backend_qnn_context {
     int device;
     int threads;
@@ -21,5 +26,6 @@ struct ggml_backend_qnn_context {
     QNN_INTERFACE_VER_TYPE raw_interface;
     QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface;
     qnn::qcom_socinfo socinfo;
-    std::unordered_map<std::string, std::unique_ptr<qnn::ggml_qnn_graph_binary>> qnn_binary_graph_cache;
+    qnn::ggml_qnn_unary_graph_cache_t qnn_unary_graph_cache;
+    qnn::ggml_qnn_binary_graph_cache_t qnn_binary_graph_cache;
 };