split qnn ops into file

2024-06-24 22:11:28 +08:00 · 2024-06-24 22:11:28 +08:00 · c9e99bd603
commit c9e99bd603
parent e1056da1c0
9 changed files with 889 additions and 845 deletions
--- a/ggml-qnn.cpp
+++ b/ggml-qnn.cpp
@ -1,22 +1,14 @@
 #include <stdio.h>
 #include <stdlib.h>
-#include <stdint.h>
 #include <stdatomic.h>
 #include <string.h>
-#include <stddef.h>
-#include <inttypes.h>
-#include <math.h>
 #include <time.h>
 #include <unistd.h>
-#include <dlfcn.h>
-#include <fcntl.h>
 #include <sys/stat.h>

-#include <string>
 #include <vector>
 #include <thread>
 #include <mutex>
-#include <map>
 #include <set>
 #include <tuple>
 #include <queue>
@ -28,7 +20,6 @@
 #include <regex>
 #include <random>
 #include <functional>
-#include <unordered_map>
 #include <condition_variable>
 #include <cassert>
 #include <unordered_set>
@ -40,8 +31,9 @@

 #include "ggml-qnn/logger.hpp"
 #include "ggml-qnn/utils.hpp"
-#include "ggml-qnn/backend.hpp"
 #include "ggml-qnn/tensor.hpp"
+#include "ggml-qnn/backend.hpp"
+#include "ggml-qnn/backend-ops.hpp"

 // =================================================================================================
 //
@ -63,11 +55,6 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor);

 #define QNN_BACKEND_NAME            "qnn"

-typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx,
-                                const ggml_tensor * src0,
-                                const ggml_tensor * src1,
-                                ggml_tensor * dst);
-
 static struct qnn::qcom_socinfo g_qnn_soc_info_table[] = {
        /* Qualcomm SnapDragon 8 Gen 1 */
        [qnn::SM8450] = {
@ -183,78 +170,6 @@ struct ggml_backend_qnn_buffer_type_context {
 //  QNN backend internal helper functions
 //
 // =================================================================================================
-static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
-                            const ggml_tensor * src1, ggml_tensor * dst) {
-    if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) {
-        QNN_LOG_WARN("invalid params\n");
-        return false;
-    }
-
-    qnn::qnn_instance *instance = nullptr;
-    Qnn_Tensor_t      *tensor_0 = nullptr;
-    Qnn_Tensor_t      *tensor_1 = nullptr;
-    Qnn_Tensor_t      *tensor_2 = nullptr;
-    tensor_0 = (Qnn_Tensor_t *) src0->extra;
-    tensor_1 = (Qnn_Tensor_t *) src1->extra;
-    tensor_2 = (Qnn_Tensor_t *) dst->extra;
-    instance = ctx->instance;
-    if ((nullptr == instance) || (nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) {
-        QNN_LOG_WARN("invalid params\n");
-        return false;
-    }
-
-    return true;
-}
-
-#ifndef NDEBUG
-#define CHECK_PARAMS(ctx, src0, src1, dst)                          \
-    do {                                                            \
-        if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) {   \
-            return;                                                 \
-        }                                                           \
-    } while (0)
-
-#else
-#define CHECK_PARAMS(ctx, src0, src1, dst)
-#endif
-
-#if ENABLE_QNNBACKEND_PERF
-class qnn_perf {
-public:
-    qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {};
-    qnn_perf() = delete;
-    qnn_perf(const qnn_perf & ) = delete;
-    qnn_perf & operator= (const qnn_perf & ) = delete;
-
-    void start() {
-        _begin_time = ggml_time_us();
-    }
-
-    void info() {
-        _end_time = ggml_time_us();
-        _duration = (_end_time - _begin_time);
-        QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration);
-    }
-
-private:
-    int64_t _begin_time = 0LL;
-    int64_t _end_time   = 0LL;
-    int64_t _duration   = 0LL;
-    std::string _perf_name;
-};
-#else
-class qnn_perf {
-public:
-    qnn_perf(const std::string & perf_name) {}
-    qnn_perf() = delete;
-    qnn_perf(const qnn_perf & ) = delete;
-    qnn_perf & operator= (const qnn_perf & ) = delete;
-
-    void start() {}
-    void info() {}
-};
-#endif
-
 static size_t memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) {
    if (!dst || !src || !dst_size || !copy_size) return 0;

@ -354,100 +269,10 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor) {
 //  implementation of QNN backend for GGML
 //
 // =================================================================================================
-static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
-    const ggml_tensor * src1, ggml_tensor * dst);
-static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
-
-static ggml_qnn_func_t s_op_table[GGML_OP_COUNT] = {
-    nullptr, // GGML_OP_NONE
-    nullptr, // GGML_OP_DUP
-    ggml_qnn_add, // GGML_OP_ADD
-    nullptr, // GGML_OP_ADD1
-    nullptr, // GGML_OP_ACC
-    nullptr, // GGML_OP_SUB
-    nullptr, // GGML_OP_MUL
-    nullptr, // GGML_OP_DIV
-    nullptr, // GGML_OP_SQR
-    nullptr, // GGML_OP_SQRT
-    nullptr, // GGML_OP_LOG
-    nullptr, // GGML_OP_SUM
-    nullptr, // GGML_OP_SUM_ROWS
-    nullptr, // GGML_OP_MEAN
-    nullptr, // GGML_OP_ARGMAX
-    nullptr, // GGML_OP_REPEAT
-    nullptr, // GGML_OP_REPEAT_BACK
-    nullptr, // GGML_OP_CONCAT
-    nullptr, // GGML_OP_SILU_BACK
-    nullptr, // GGML_OP_NORM
-    nullptr, // GGML_OP_RMS_NORM
-    nullptr, // GGML_OP_RMS_NORM_BACK
-    nullptr, // GGML_OP_GROUP_NORM
-
-    ggml_qnn_mul_mat, // GGML_OP_MUL_MAT
-    nullptr, // GGML_OP_MUL_MAT_ID
-    nullptr, // GGML_OP_OUT_PROD
-
-    nullptr, // GGML_OP_SCALE
-    nullptr, // GGML_OP_SET
-    nullptr, // GGML_OP_CPY
-    nullptr, // GGML_OP_CONT
-    nullptr, // GGML_OP_RESHAPE
-    nullptr, // GGML_OP_VIEW
-    nullptr, // GGML_OP_PERMUTE
-    nullptr, // GGML_OP_TRANSPOSE
-    nullptr, // GGML_OP_GET_ROWS
-    nullptr, // GGML_OP_GET_ROWS_BACK
-    nullptr, // GGML_OP_DIAG
-    nullptr, // GGML_OP_DIAG_MASK_INF
-    nullptr, // GGML_OP_DIAG_MASK_ZERO
-    nullptr, // GGML_OP_SOFT_MAX
-    nullptr, // GGML_OP_SOFT_MAX_BACK
-    nullptr, // GGML_OP_ROPE
-    nullptr, // GGML_OP_ROPE_BACK
-    nullptr, // GGML_OP_CLAMP
-    nullptr, // GGML_OP_CONV_TRANSPOSE_1D
-    nullptr, // GGML_OP_IM2COL
-    nullptr, // GGML_OP_CONV_TRANSPOSE_2D
-    nullptr, // GGML_OP_POOL_1D
-    nullptr, // GGML_OP_POOL_2D
-    nullptr, // GGML_OP_UPSCALE
-    nullptr, // GGML_OP_PAD
-    nullptr, // GGML_OP_ARANGE
-    nullptr, // GGML_OP_TIMESTEP_EMBEDDING
-    nullptr, // GGML_OP_ARGSORT
-    nullptr, // GGML_OP_LEAKY_RELU
-
-    nullptr, // GGML_OP_FLASH_ATTN_EXT
-    nullptr, // GGML_OP_FLASH_ATTN_BACK
-    nullptr, // GGML_OP_SSM_CONV
-    nullptr, // GGML_OP_SSM_SCAN
-    nullptr, // GGML_OP_WIN_PART
-    nullptr, // GGML_OP_WIN_UNPART
-    nullptr, // GGML_OP_GET_REL_POS
-    nullptr, // GGML_OP_ADD_REL_POS
-
-    nullptr, // GGML_OP_UNARY
-
-    nullptr, // GGML_OP_MAP_UNARY
-    nullptr, // GGML_OP_MAP_BINARY
-
-    nullptr, // GGML_OP_MAP_CUSTOM1_F32
-    nullptr, // GGML_OP_MAP_CUSTOM2_F32
-    nullptr, // GGML_OP_MAP_CUSTOM3_F32
-
-    nullptr, // GGML_OP_MAP_CUSTOM1
-    nullptr, // GGML_OP_MAP_CUSTOM2
-    nullptr, // GGML_OP_MAP_CUSTOM3
-
-    nullptr, // GGML_OP_CROSS_ENTROPY_LOSS
-    nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK
-};
-
 static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
                                   const struct ggml_tensor * tensor,
                                   bool b_dump_tensor_info) {
-    if (ggml_is_empty(tensor) || !s_op_table[tensor->op]) {
+    if (ggml_is_empty(tensor) || !qnn::ggml_qnn_op_array()[tensor->op]) {
        return false;
    }

@ -496,550 +321,10 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
    return true;
 }

-
-//TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat
-//      keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC
-static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
-                         const ggml_tensor * src1, ggml_tensor * dst) {
-    Qnn_ErrorHandle_t  error             = QNN_SUCCESS;
-    bool               graph_initialized = false;
-    qnn::qnn_instance *instance          = nullptr;
-    std::string        graph_name        = "ggml_op_qnn_add";
-    Qnn_GraphHandle_t  graph_handle      = nullptr;
-    Qnn_Param_t        qnn_params[]      = {};
-    enum ggml_op       ggmlop            = GGML_OP_ADD;
-
-    CHECK_PARAMS(ctx, src0, src1, dst);
-    instance = ctx->instance;
-    auto qnn_raw_interface = ctx->raw_interface;
-
-    qnn_perf perf("ggml_qnn_add");
-    perf.start();
-
-    std::string map_entry = std::string(ggml_op_name(ggmlop));
-    if (instance->_qnn_graph_map.find(map_entry) !=
-        instance->_qnn_graph_map.end()) {
-        graph_initialized = true;
-        auto & graph_item = instance->_qnn_graph_map[map_entry];
-        graph_handle      = std::get<0>(graph_item);
-    }
-
-    if (!graph_initialized) {
-        graph_name = graph_name + "_" + std::to_string(ctx->threads) +
-                     "_" + src0->name + "_" + src1->name;
-        QNN_LOG_INFO("graph name %s", graph_name.c_str());
-        if (ctx->device == QNN_BACKEND_NPU) {
-            QnnHtpGraph_CustomConfig_t hvx_config;
-            hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
-            hvx_config.numHvxThreads = 8;
-            QnnGraph_Config_t graph_hvx_config;
-            graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
-            graph_hvx_config.customConfig = &hvx_config;
-
-            QnnHtpGraph_CustomConfig_t dlbc_config;
-            dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
-            dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
-            dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
-            QnnGraph_Config_t graph_dlbc_config;
-            graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
-            graph_dlbc_config.customConfig = &dlbc_config;
-
-            QnnHtpGraph_CustomConfig_t opt_config;
-            opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
-            opt_config.optimizationOption.floatValue = 1;    // 1 / 3
-            QnnGraph_Config_t graph_opt_config;
-            graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
-            graph_opt_config.customConfig = &opt_config;
-
-            QnnHtpGraph_CustomConfig_t vtcm_config;
-            vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
-            vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb;
-            QnnGraph_Config_t graph_vtcm_config;
-            graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
-            graph_vtcm_config.customConfig = &vtcm_config;
-
-            const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config,
-                                                         &graph_dlbc_config,
-                                                         &graph_vtcm_config,
-                                                         &graph_opt_config,
-                                                         NULL};
-            error = qnn_raw_interface.graphCreate(
-                    instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig,
-                    &graph_handle);
-        } else {
-            error = qnn_raw_interface.graphCreate(
-                    instance->get_qnn_context_handle(), graph_name.c_str(), nullptr,
-                    &graph_handle);
-        }
-
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("can't create qnn graph handle with graph name %s, "
-                         "error = %d\n",
-                         graph_name.c_str(), error);
-            goto failure;
-        } else {
-            QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str());
-        }
-
-        qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx);
-        if (!tensor_input0.is_valid()) {
-            goto failure;
-        }
-        qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx);
-        if (!tensor_input1.is_valid()) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx);
-        if (!tensor_output.is_valid()) {
-            goto failure;
-        }
-
-        Qnn_Tensor_t   tensor_inputs[]  = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()};
-        Qnn_Tensor_t   tensor_outputs[] = {*tensor_output.get_qnn_tensor()};
-        Qnn_OpConfig_t op_config        = {
-            (Qnn_OpConfigVersion_t) 1,
-            .v1 = {"ggml_op_add",
-                   QNN_OP_PACKAGE_NAME_QTI_AISW,
-                   QNN_OP_ELEMENT_WISE_ADD,
-                   0, qnn_params,
-                   2, tensor_inputs,
-                   1,tensor_outputs}
-        };
-        error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.graphFinalize(graph_handle,
-                                                nullptr, nullptr);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.graphExecute(graph_handle,
-                                           tensor_inputs, 2,
-                                           tensor_outputs, 1,
-                                           nullptr, nullptr);
-        if (ctx->device == QNN_BACKEND_NPU) {
-            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
-                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
-            }
-        }
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-
-        auto graph_item = std::make_tuple(graph_handle, 
-                                          tensor_input0.get_qnn_tensor(), 
-                                          tensor_input1.get_qnn_tensor(), 
-                                          tensor_output.get_qnn_tensor());
-        instance->_qnn_graph_map[map_entry] = graph_item;
-    } else {
-        auto & graph_item = instance->_qnn_graph_map[map_entry];
-        qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx);
-        qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx);
-        qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx);
-
-        Qnn_Tensor_t tensor_inputs[]  = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()};
-        Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()};
-        error = qnn_raw_interface.graphExecute(graph_handle,
-                                           tensor_inputs,2,
-                                           tensor_outputs,1,
-                                           nullptr, nullptr);
-        if (ctx->device == QNN_BACKEND_NPU) {
-            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
-                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
-            }
-        }
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-    }
-
-failure:
-    if (QNN_SUCCESS != error) {
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                              " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      src0->name, src0->type, ggml_type_name(src0->type),
-                      src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0],
-                      src0->nb[1], src0->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                              " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      src1->name, src1->type, ggml_type_name(src1->type),
-                      src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0],
-                      src1->nb[1], src1->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                              " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      dst->name, dst->type, ggml_type_name(dst->type),
-                      dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0],
-                      dst->nb[1], dst->nb[2]);
-    }
-
-    perf.info();
-}
-
-/*
- * ggml_qnn_mul_mat was re-added as a standalone function because
- * the following comments came from https://github.com/ggerganov/llama.cpp/pull/1632
- * MUL_MAT take most of the compute time (about 95%).
- * So to speed up llama, we have to focus on MUL_MAT.
- *
- * We have three kinds of MUL_MAT to compute:
- * mul_mat_f32:     both src0 and src1 are F32.
- * mul_mat_f16_f32: src0 is F16 and src1 is F32.
- * mul_mat_q_f32:   src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32.
- */
-static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
-                             const ggml_tensor * src0, const ggml_tensor * src1,
-                             ggml_tensor * dst) {
-    Qnn_ErrorHandle_t  error              = QNN_SUCCESS;
-    bool               graph_initialized  = false;
-    qnn::qnn_instance *instance           = nullptr;
-    std::string        graph_name         = "ggml_op_qnn_mul_mat";
-    Qnn_GraphHandle_t  graph_handle       = nullptr;
-    Qnn_Param_t        qnn_params[]             = {};
-    enum ggml_op       ggmlop             = GGML_OP_MUL_MAT;
-
-    CHECK_PARAMS(ctx, src0, src1, dst);
-    instance = ctx->instance;
-    auto qnn_raw_interface = ctx->raw_interface;
-
-    qnn_perf perf("ggml_qnn_mul_mat");
-    perf.start();
-
-    std::string map_entry = std::string(ggml_op_name(ggmlop));
-    if (instance->_qnn_graph_map.find(map_entry) !=
-        instance->_qnn_graph_map.end()) {
-        graph_initialized = true;
-        auto & graph_item = instance->_qnn_graph_map[map_entry];
-        graph_handle      = std::get<0>(graph_item);
-    }
-
-    //TODO: for scenarios of quantized data in src0
-    //      pass-1: dequantize src0 to FP32
-    //      pass-2: dq-src0 * src1
-    //      the performance gains is worth although there is performance loss in pass-1
-
-    if (!graph_initialized) {
-        graph_name = graph_name + "_" + std::to_string(ctx->threads) +
-                     "_" + src0->name + "_" + src1->name;
-        QNN_LOG_INFO("graph name %s", graph_name.c_str());
-        if (ctx->device == QNN_BACKEND_NPU) {
-            QnnHtpGraph_CustomConfig_t hvx_config;
-            hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
-            hvx_config.numHvxThreads = 8;
-            QnnGraph_Config_t graph_hvx_config;
-            graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
-            graph_hvx_config.customConfig = &hvx_config;
-
-            QnnHtpGraph_CustomConfig_t dlbc_config;
-            dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
-            dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
-            dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
-            QnnGraph_Config_t graph_dlbc_config;
-            graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
-            graph_dlbc_config.customConfig = &dlbc_config;
-
-            QnnHtpGraph_CustomConfig_t opt_config;
-            opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
-            opt_config.optimizationOption.floatValue = 1; //1 / 3
-            QnnGraph_Config_t graph_opt_config;
-            graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
-            graph_opt_config.customConfig = &opt_config;
-
-            QnnHtpGraph_CustomConfig_t vtcm_config;
-            vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
-            vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb;
-            QnnGraph_Config_t graph_vtcm_config;
-            graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
-            graph_vtcm_config.customConfig = &vtcm_config;
-
-            const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config,
-                                                         &graph_dlbc_config,
-                                                         &graph_vtcm_config,
-                                                         &graph_opt_config,
-                                                         NULL};
-            error = qnn_raw_interface.graphCreate(
-                    instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig,
-                    &graph_handle);
-        } else {
-            error = qnn_raw_interface.graphCreate(
-                    instance->get_qnn_context_handle(), graph_name.c_str(), nullptr,
-                    &graph_handle);
-        }
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("can't create qnn graph handle with graph name %s, "
-                         "error = %d\n",
-                         graph_name.c_str(), error);
-            goto failure;
-        }
-
-        qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx);
-        if (!tensor_input0.is_valid()) {
-            goto failure;
-        }
-        qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx);
-        if (!tensor_input1.is_valid()) {
-            goto failure;
-        }
-        qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx);
-        if (!tensor_output.is_valid()) {
-            goto failure;
-        }
-
-        Qnn_Tensor_t   tensor_inputs[]  = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()};
-        Qnn_Tensor_t   tensor_outputs[] = {*tensor_output.get_qnn_tensor()};
-        Qnn_OpConfig_t op_config = {
-                (Qnn_OpConfigVersion_t) 1,
-                .v1 = {"ggml_op_mul_mat",
-                       QNN_OP_PACKAGE_NAME_QTI_AISW,
-                       QNN_OP_MAT_MUL,
-                       0, qnn_params,
-                       2, tensor_inputs,
-                       1, tensor_outputs}
-        };
-        error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.graphFinalize(graph_handle,
-                                                nullptr, nullptr);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.graphExecute(graph_handle,
-                                           tensor_inputs, 2,
-                                           tensor_outputs, 1,
-                                           nullptr, nullptr);
-        if (ctx->device == QNN_BACKEND_NPU) {
-            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
-                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
-            }
-        }
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-
-        auto graph_item = std::make_tuple(graph_handle, 
-                                          tensor_input0.get_qnn_tensor(), 
-                                          tensor_input1.get_qnn_tensor(), 
-                                          tensor_output.get_qnn_tensor());
-        instance->_qnn_graph_map[map_entry] = graph_item;
-    } else {
-        auto & graph_item= instance->_qnn_graph_map[map_entry];
-        qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx);
-        qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx);
-        qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx);
-
-        Qnn_Tensor_t tensor_inputs[]  = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()};
-        Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()};
-        error = qnn_raw_interface.graphExecute(graph_handle,
-                                           tensor_inputs, 2,
-                                           tensor_outputs, 1,
-                                           nullptr, nullptr);
-        if (ctx->device == QNN_BACKEND_NPU) {
-            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
-                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
-            }
-        }
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-    }
-
-failure:
-    if (QNN_SUCCESS != error) {
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                              " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      src0->name, src0->type, ggml_type_name(src0->type),
-                      src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0],
-                      src0->nb[1], src0->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                              " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      src1->name, src1->type, ggml_type_name(src1->type),
-                      src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0],
-                      src1->nb[1], src1->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                              " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0],
-                      dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]);
-    }
-
-    perf.info();
-}
-
-static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx,
-                            const ggml_tensor * src0, const ggml_tensor * src1,
-                            ggml_tensor * dst) {
-}
-
-static void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx,
-                              const ggml_tensor * src0, const ggml_tensor * src1,
-                              ggml_tensor * dst) {
-}
-
-static void ggml_qnn_acc(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
-                         const ggml_tensor * src1, ggml_tensor * dst) {
-}
-
-static void ggml_qnn_div(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
-                         const ggml_tensor * src1, ggml_tensor * dst) {
-}
-
-static void ggml_qnn_gelu(ggml_backend_qnn_context * ctx,
-                          const ggml_tensor * src0, const ggml_tensor * src1,
-                          ggml_tensor * dst) {
-}
-
-static void ggml_qnn_silu(ggml_backend_qnn_context * ctx,
-                          const ggml_tensor * src0, const ggml_tensor * src1,
-                          ggml_tensor * dst) {
-}
-
-static void ggml_qnn_gelu_quick(ggml_backend_qnn_context * ctx,
-                                const ggml_tensor * src0,
-                                const ggml_tensor * src1, ggml_tensor * dst) {
-}
-
-static void ggml_qnn_tanh(ggml_backend_qnn_context * ctx,
-                          const ggml_tensor * src0, const ggml_tensor * src1,
-                          ggml_tensor * dst) {
-}
-
-static void ggml_qnn_relu(ggml_backend_qnn_context * ctx,
-                          const ggml_tensor * src0, const ggml_tensor * src1,
-                          ggml_tensor * dst) {
-}
-
-static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context * ctx,
-                                 const ggml_tensor * src0,
-                                 const ggml_tensor * src1, ggml_tensor * dst) {
-}
-
-static void ggml_qnn_hardswish(ggml_backend_qnn_context * ctx,
-                               const ggml_tensor * src0, const ggml_tensor * src1,
-                               ggml_tensor * dst) {
-}
-
-static void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx,
-                                const ggml_tensor * src0,
-                                const ggml_tensor * src1, ggml_tensor * dst) {
-}
-
-static void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
-                         const ggml_tensor * src1, ggml_tensor * dst) {
-}
-
-static void ggml_qnn_norm(ggml_backend_qnn_context * ctx,
-                          const ggml_tensor * src0, const ggml_tensor * src1,
-                          ggml_tensor * dst) {
-}
-
-static void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx,
-                                const ggml_tensor * src0,
-                                const ggml_tensor * src1, ggml_tensor * dst) {
-}
-
-static void ggml_qnn_concat(ggml_backend_qnn_context * ctx,
-                            const ggml_tensor * src0, const ggml_tensor * src1,
-                            ggml_tensor * dst) {
-}
-
-static void ggml_qnn_upscale(ggml_backend_qnn_context * ctx,
-                             const ggml_tensor * src0, const ggml_tensor * src1,
-                             ggml_tensor * dst) {
-}
-
-static void ggml_qnn_pad(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
-                         const ggml_tensor * src1, ggml_tensor * dst) {
-}
-
-static void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx,
-                              const ggml_tensor * src0, const ggml_tensor * src1,
-                              ggml_tensor * dst) {
-}
-
-static void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
-                         const ggml_tensor * src1, ggml_tensor * dst) {
-}
-
-static void ggml_qnn_dup(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
-                         const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_qnn_cpy(ctx, src0, dst, nullptr);
-    (void) src1;
-}
-
-static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context * ctx,
-                                const ggml_tensor * src0,
-                                const ggml_tensor * src1, ggml_tensor * dst) {
-}
-
-static void ggml_qnn_scale(ggml_backend_qnn_context * ctx,
-                           const ggml_tensor * src0, const ggml_tensor * src1,
-                           ggml_tensor * dst) {
-}
-
-static void ggml_qnn_clamp(ggml_backend_qnn_context * ctx,
-                           const ggml_tensor * src0, const ggml_tensor * src1,
-                           ggml_tensor * dst) {
-}
-
-static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context * ctx,
-                                   const ggml_tensor * src0,
-                                   const ggml_tensor * src1, ggml_tensor * dst) {
-}
-
-static void ggml_qnn_soft_max(ggml_backend_qnn_context * ctx,
-                              const ggml_tensor * src0, const ggml_tensor * src1,
-                              ggml_tensor * dst) {
-}
-
-static void ggml_qnn_rope(ggml_backend_qnn_context * ctx,
-                          const ggml_tensor * src0, const ggml_tensor * src1,
-                          ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-}
-
-static void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx,
-                            const ggml_tensor * src0, const ggml_tensor * src1,
-                            ggml_tensor * dst) {
-}
-
-static void ggml_qnn_im2col(ggml_backend_qnn_context * ctx,
-                            const ggml_tensor * src0, const ggml_tensor * src1,
-                            ggml_tensor * dst) {
-}
-
-static void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx,
-                              const ggml_tensor * src0, const ggml_tensor * src1,
-                              ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-}
-
-static void ggml_qnn_argsort(ggml_backend_qnn_context * ctx,
-                             const ggml_tensor * src0, const ggml_tensor * src1,
-                             ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-}
-
-static void ggml_qnn_nop(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
-                         const ggml_tensor * src1, ggml_tensor * dst) {
-    (void)src0;
-    (void)src1;
-    (void)dst;
-}
-
 bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx,
                              struct ggml_compute_params * params,
                              struct ggml_tensor * tensor) {
-    ggml_qnn_func_t func = s_op_table[tensor->op];
+    auto func = qnn::ggml_qnn_op_array()[tensor->op];
    if (!func) {
        QNN_LOG_WARN("unsupported op %d", tensor->op);
        return false;
--- a/ggml-qnn/backend-ops.cpp
+++ b/ggml-qnn/backend-ops.cpp
@ -0,0 +1,675 @@
+
+#include "backend-ops.hpp"
+
+#include "utils.hpp"
+#include "logger.hpp"
+#include "tensor.hpp"
+
+
+// Sanity check shared by the op implementations: returns true only when the
+// context, its QNN instance, the three tensors and each tensor's `extra`
+// payload are all non-null. Logs a warning and returns false otherwise.
+static bool qnn_is_valid_params(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    const bool has_null_arg = !ctx || !src0 || !src1 || !dst;
+    if (has_null_arg) {
+        QNN_LOG_WARN("invalid params\n");
+        return false;
+    }
+
+    // The op implementations read a Qnn_Tensor_t from `extra`, so it must be set.
+    const bool missing_state = (ctx->instance == nullptr) || (src0->extra == nullptr) ||
+        (src1->extra == nullptr) || (dst->extra == nullptr);
+    if (missing_state) {
+        QNN_LOG_WARN("invalid params\n");
+        return false;
+    }
+
+    return true;
+}
+
+// CHECK_PARAMS(ctx, src0, src1, dst)
+// Debug builds: returns from the enclosing (void) function when
+// qnn_is_valid_params() rejects the arguments.
+// Builds with NDEBUG defined: expands to nothing, so no validation is done.
+#ifndef NDEBUG
+#define CHECK_PARAMS(ctx, src0, src1, dst)                          \
+    do {                                                            \
+        if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) {   \
+            return;                                                 \
+        }                                                           \
+    } while (0)
+
+#else
+#define CHECK_PARAMS(ctx, src0, src1, dst)
+#endif
+
+// TODO: this function can be removed later because it duplicates most of ggml_qnn_mul_mat;
+//       it is kept to illustrate how to implement a specific GGML op using the QNN API + QNN RPC.
+//
+// Runs a single-node QNN graph (QNN_OP_ELEMENT_WISE_ADD) with src0 and src1 as
+// inputs and dst as output.
+//
+// On the first call the graph is created, finalized, executed and then cached in
+// instance->_qnn_graph_map under the ggml op name; later calls only re-bind the
+// tensors and execute the cached graph.
+// NOTE(review): the cache key is the op name alone, so the first graph is reused
+// regardless of the shapes/types of later tensors - confirm this is intended.
+//
+// Errors are logged; the function returns nothing, so callers cannot observe them.
+static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    Qnn_ErrorHandle_t  error = QNN_SUCCESS;
+    bool               graph_initialized = false;
+    qnn::qnn_instance* instance = nullptr;
+    std::string        graph_name = "ggml_op_qnn_add";
+    Qnn_GraphHandle_t  graph_handle = nullptr;
+    Qnn_Param_t        qnn_params[] = {};
+    enum ggml_op       ggmlop = GGML_OP_ADD;
+
+    CHECK_PARAMS(ctx, src0, src1, dst);
+    instance = ctx->instance;
+    auto qnn_raw_interface = ctx->raw_interface;
+
+    qnn::qnn_perf perf("ggml_qnn_add");
+    perf.start();
+
+    // Look the cached graph up once and reuse the iterator below.
+    std::string map_entry = std::string(ggml_op_name(ggmlop));
+    auto cached_graph = instance->_qnn_graph_map.find(map_entry);
+    if (cached_graph != instance->_qnn_graph_map.end()) {
+        graph_initialized = true;
+        graph_handle = std::get<0>(cached_graph->second);
+    }
+
+    if (!graph_initialized) {
+        graph_name = graph_name + "_" + std::to_string(ctx->threads) +
+            "_" + src0->name + "_" + src1->name;
+        QNN_LOG_INFO("graph name %s", graph_name.c_str());
+        if (ctx->device == QNN_BACKEND_NPU) {
+            // HTP-specific graph configuration: HVX threads, DLBC, finalize
+            // optimization level and VTCM size.
+            QnnHtpGraph_CustomConfig_t hvx_config;
+            hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
+            hvx_config.numHvxThreads = 8;
+            QnnGraph_Config_t graph_hvx_config;
+            graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_hvx_config.customConfig = &hvx_config;
+
+            QnnHtpGraph_CustomConfig_t dlbc_config;
+            dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
+            dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
+            dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
+            QnnGraph_Config_t graph_dlbc_config;
+            graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_dlbc_config.customConfig = &dlbc_config;
+
+            QnnHtpGraph_CustomConfig_t opt_config;
+            // The option selector must be set explicitly; it was previously left
+            // uninitialized, so the backend could not know which union member to read.
+            opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
+            opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+            opt_config.optimizationOption.floatValue = 1;    // 1 / 3
+            QnnGraph_Config_t graph_opt_config;
+            graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_opt_config.customConfig = &opt_config;
+
+            QnnHtpGraph_CustomConfig_t vtcm_config;
+            vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
+            vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb;
+            QnnGraph_Config_t graph_vtcm_config;
+            graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_vtcm_config.customConfig = &vtcm_config;
+
+            // NULL-terminated list, as passed to graphCreate.
+            const QnnGraph_Config_t* p_graphconfig[] = { &graph_hvx_config,
+                                                         &graph_dlbc_config,
+                                                         &graph_vtcm_config,
+                                                         &graph_opt_config,
+                                                         NULL };
+            error = qnn_raw_interface.graphCreate(
+                instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig,
+                &graph_handle);
+        }
+        else {
+            error = qnn_raw_interface.graphCreate(
+                instance->get_qnn_context_handle(), graph_name.c_str(), nullptr,
+                &graph_handle);
+        }
+
+        if (QNN_SUCCESS != error) {
+            QNN_LOG_INFO("can't create qnn graph handle with graph name %s, "
+                "error = %d\n",
+                graph_name.c_str(), error);
+            goto failure;
+        }
+        else {
+            QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str());
+        }
+
+        // Bind the ggml tensors to the freshly created graph.
+        qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx);
+        if (!tensor_input0.is_valid()) {
+            goto failure;
+        }
+        qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx);
+        if (!tensor_input1.is_valid()) {
+            QNN_LOG_INFO("error = %d\n", error);
+            goto failure;
+        }
+        qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx);
+        if (!tensor_output.is_valid()) {
+            goto failure;
+        }
+
+        Qnn_Tensor_t   tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() };
+        Qnn_Tensor_t   tensor_outputs[] = { *tensor_output.get_qnn_tensor() };
+        Qnn_OpConfig_t op_config = {
+            (Qnn_OpConfigVersion_t)1,
+            .v1 = {"ggml_op_add",
+                   QNN_OP_PACKAGE_NAME_QTI_AISW,
+                   QNN_OP_ELEMENT_WISE_ADD,
+                   0, qnn_params,
+                   2, tensor_inputs,
+                   1,tensor_outputs}
+        };
+        error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
+        if (QNN_SUCCESS != error) {
+            QNN_LOG_INFO("error = %d\n", error);
+            goto failure;
+        }
+        error = qnn_raw_interface.graphFinalize(graph_handle,
+            nullptr, nullptr);
+        if (QNN_SUCCESS != error) {
+            QNN_LOG_INFO("error = %d\n", error);
+            goto failure;
+        }
+        error = qnn_raw_interface.graphExecute(graph_handle,
+            tensor_inputs, 2,
+            tensor_outputs, 1,
+            nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
+        if (QNN_SUCCESS != error) {
+            QNN_LOG_INFO("error = %d\n", error);
+            goto failure;
+        }
+
+        // Cache the graph only after the first execution succeeded.
+        auto graph_item = std::make_tuple(graph_handle,
+            tensor_input0.get_qnn_tensor(),
+            tensor_input1.get_qnn_tensor(),
+            tensor_output.get_qnn_tensor());
+        instance->_qnn_graph_map[map_entry] = graph_item;
+    }
+    else {
+        // Cached path: re-bind the current ggml tensors to the stored QNN tensors.
+        auto& graph_item = cached_graph->second;
+        qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx);
+        qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx);
+        qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx);
+
+        Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() };
+        Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() };
+        error = qnn_raw_interface.graphExecute(graph_handle,
+            tensor_inputs, 2,
+            tensor_outputs, 1,
+            nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
+        if (QNN_SUCCESS != error) {
+            QNN_LOG_INFO("error = %d\n", error);
+            goto failure;
+        }
+    }
+
+failure:
+    // Reached on success as well; the tensor dump is emitted only after a QNN error.
+    if (QNN_SUCCESS != error) {
+        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
+            " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+            src0->name, src0->type, ggml_type_name(src0->type),
+            src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0],
+            src0->nb[1], src0->nb[2]);
+        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
+            " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+            src1->name, src1->type, ggml_type_name(src1->type),
+            src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0],
+            src1->nb[1], src1->nb[2]);
+        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
+            " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+            dst->name, dst->type, ggml_type_name(dst->type),
+            dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0],
+            dst->nb[1], dst->nb[2]);
+    }
+
+    perf.info();
+}
+
+/*
+ * ggml_qnn_mul_mat is kept as a standalone function. Background, taken from
+ * https://github.com/ggerganov/llama.cpp/pull/1632:
+ * MUL_MAT takes most of the compute time (about 95%).
+ * So to speed up llama, we have to focus on MUL_MAT.
+ *
+ * We have three kinds of MUL_MAT to compute:
+ * mul_mat_f32:     both src0 and src1 are F32.
+ * mul_mat_f16_f32: src0 is F16 and src1 is F32.
+ * mul_mat_q_f32:   src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32.
+ *
+ * Implementation: runs a single-node QNN graph (QNN_OP_MAT_MUL) with src0 and
+ * src1 as inputs and dst as output. On the first call the graph is created,
+ * finalized, executed and cached in instance->_qnn_graph_map under the ggml op
+ * name; later calls re-bind the tensors and execute the cached graph.
+ * NOTE(review): the cache key is the op name alone, so the first graph is
+ * reused regardless of the shapes/types of later tensors - confirm intended.
+ *
+ * Errors are logged; the function returns nothing, so callers cannot observe them.
+ */
+static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx,
+    const ggml_tensor* src0, const ggml_tensor* src1,
+    ggml_tensor* dst) {
+    Qnn_ErrorHandle_t  error = QNN_SUCCESS;
+    bool               graph_initialized = false;
+    qnn::qnn_instance* instance = nullptr;
+    std::string        graph_name = "ggml_op_qnn_mul_mat";
+    Qnn_GraphHandle_t  graph_handle = nullptr;
+    Qnn_Param_t        qnn_params[] = {};
+    enum ggml_op       ggmlop = GGML_OP_MUL_MAT;
+
+    CHECK_PARAMS(ctx, src0, src1, dst);
+    instance = ctx->instance;
+    auto qnn_raw_interface = ctx->raw_interface;
+
+    qnn::qnn_perf perf("ggml_qnn_mul_mat");
+    perf.start();
+
+    // Look the cached graph up once and reuse the iterator below.
+    std::string map_entry = std::string(ggml_op_name(ggmlop));
+    auto cached_graph = instance->_qnn_graph_map.find(map_entry);
+    if (cached_graph != instance->_qnn_graph_map.end()) {
+        graph_initialized = true;
+        graph_handle = std::get<0>(cached_graph->second);
+    }
+
+    // TODO: for quantized data in src0:
+    //       pass 1: dequantize src0 to FP32
+    //       pass 2: dq-src0 * src1
+    //       the overall gain is expected to outweigh the cost of pass 1
+
+    if (!graph_initialized) {
+        graph_name = graph_name + "_" + std::to_string(ctx->threads) +
+            "_" + src0->name + "_" + src1->name;
+        QNN_LOG_INFO("graph name %s", graph_name.c_str());
+        if (ctx->device == QNN_BACKEND_NPU) {
+            // HTP-specific graph configuration: HVX threads, DLBC, finalize
+            // optimization level and VTCM size.
+            QnnHtpGraph_CustomConfig_t hvx_config;
+            hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
+            hvx_config.numHvxThreads = 8;
+            QnnGraph_Config_t graph_hvx_config;
+            graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_hvx_config.customConfig = &hvx_config;
+
+            QnnHtpGraph_CustomConfig_t dlbc_config;
+            dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
+            dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
+            dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
+            QnnGraph_Config_t graph_dlbc_config;
+            graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_dlbc_config.customConfig = &dlbc_config;
+
+            QnnHtpGraph_CustomConfig_t opt_config;
+            // The option selector must be set explicitly; it was previously left
+            // uninitialized, so the backend could not know which union member to read.
+            opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
+            opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+            opt_config.optimizationOption.floatValue = 1; //1 / 3
+            QnnGraph_Config_t graph_opt_config;
+            graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_opt_config.customConfig = &opt_config;
+
+            QnnHtpGraph_CustomConfig_t vtcm_config;
+            vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
+            vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb;
+            QnnGraph_Config_t graph_vtcm_config;
+            graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_vtcm_config.customConfig = &vtcm_config;
+
+            // NULL-terminated list, as passed to graphCreate.
+            const QnnGraph_Config_t* p_graphconfig[] = { &graph_hvx_config,
+                                                         &graph_dlbc_config,
+                                                         &graph_vtcm_config,
+                                                         &graph_opt_config,
+                                                         NULL };
+            error = qnn_raw_interface.graphCreate(
+                instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig,
+                &graph_handle);
+        }
+        else {
+            error = qnn_raw_interface.graphCreate(
+                instance->get_qnn_context_handle(), graph_name.c_str(), nullptr,
+                &graph_handle);
+        }
+        if (QNN_SUCCESS != error) {
+            QNN_LOG_INFO("can't create qnn graph handle with graph name %s, "
+                "error = %d\n",
+                graph_name.c_str(), error);
+            goto failure;
+        }
+
+        // Bind the ggml tensors to the freshly created graph.
+        qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx);
+        if (!tensor_input0.is_valid()) {
+            goto failure;
+        }
+        qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx);
+        if (!tensor_input1.is_valid()) {
+            goto failure;
+        }
+        qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx);
+        if (!tensor_output.is_valid()) {
+            goto failure;
+        }
+
+        Qnn_Tensor_t   tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() };
+        Qnn_Tensor_t   tensor_outputs[] = { *tensor_output.get_qnn_tensor() };
+        Qnn_OpConfig_t op_config = {
+                (Qnn_OpConfigVersion_t)1,
+                .v1 = {"ggml_op_mul_mat",
+                       QNN_OP_PACKAGE_NAME_QTI_AISW,
+                       QNN_OP_MAT_MUL,
+                       0, qnn_params,
+                       2, tensor_inputs,
+                       1, tensor_outputs}
+        };
+        error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
+        if (QNN_SUCCESS != error) {
+            QNN_LOG_INFO("error = %d\n", error);
+            goto failure;
+        }
+        error = qnn_raw_interface.graphFinalize(graph_handle,
+            nullptr, nullptr);
+        if (QNN_SUCCESS != error) {
+            QNN_LOG_INFO("error = %d\n", error);
+            goto failure;
+        }
+        error = qnn_raw_interface.graphExecute(graph_handle,
+            tensor_inputs, 2,
+            tensor_outputs, 1,
+            nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
+        if (QNN_SUCCESS != error) {
+            QNN_LOG_INFO("error = %d\n", error);
+            goto failure;
+        }
+
+        // Cache the graph only after the first execution succeeded.
+        auto graph_item = std::make_tuple(graph_handle,
+            tensor_input0.get_qnn_tensor(),
+            tensor_input1.get_qnn_tensor(),
+            tensor_output.get_qnn_tensor());
+        instance->_qnn_graph_map[map_entry] = graph_item;
+    }
+    else {
+        // Cached path: re-bind the current ggml tensors to the stored QNN tensors.
+        auto& graph_item = cached_graph->second;
+        qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx);
+        qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx);
+        qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx);
+
+        Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() };
+        Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() };
+        error = qnn_raw_interface.graphExecute(graph_handle,
+            tensor_inputs, 2,
+            tensor_outputs, 1,
+            nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
+        if (QNN_SUCCESS != error) {
+            QNN_LOG_INFO("error = %d\n", error);
+            goto failure;
+        }
+    }
+
+failure:
+    // Reached on success as well; the tensor dump is emitted only after a QNN error.
+    if (QNN_SUCCESS != error) {
+        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
+            " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+            src0->name, src0->type, ggml_type_name(src0->type),
+            src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0],
+            src0->nb[1], src0->nb[2]);
+        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
+            " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+            src1->name, src1->type, ggml_type_name(src1->type),
+            src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0],
+            src1->nb[1], src1->nb[2]);
+        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
+            " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+            dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0],
+            dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]);
+    }
+
+    perf.info();
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_repeat(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_get_rows(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_acc(ggml_backend_qnn_context* ctx,
+    const ggml_tensor* src0, const ggml_tensor* src1,
+    ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_div(ggml_backend_qnn_context* ctx,
+    const ggml_tensor* src0, const ggml_tensor* src1,
+    ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_gelu(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_silu(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_gelu_quick(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_tanh(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_relu(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_hardswish(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_leaky_relu(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_sqr(ggml_backend_qnn_context* ctx,
+    const ggml_tensor* src0, const ggml_tensor* src1,
+    ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_norm(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_group_norm(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_concat(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_upscale(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_pad(ggml_backend_qnn_context* ctx,
+    const ggml_tensor* src0, const ggml_tensor* src1,
+    ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_rms_norm(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_cpy(ggml_backend_qnn_context* ctx,
+    const ggml_tensor* src0, const ggml_tensor* src1,
+    ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Forwards to ggml_qnn_cpy, passing this op's `dst` in the `src1` slot and
+// nullptr as the callee's `dst`. The `src1` argument of this function is unused.
+static void ggml_qnn_dup(ggml_backend_qnn_context* ctx,
+    const ggml_tensor* src0, const ggml_tensor* src1,
+    ggml_tensor* dst) {
+    (void)src1;
+    ggml_qnn_cpy(ctx, src0, dst, nullptr);
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_scale(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_clamp(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_soft_max(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: only asserts that src0 is contiguous.
+static void ggml_qnn_rope(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src1;
+    (void)dst;
+    GGML_ASSERT(ggml_is_contiguous(src0));
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_pool2d(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: the body does no work.
+static void ggml_qnn_im2col(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src0;
+    (void)src1;
+    (void)dst;
+}
+
+// Not implemented yet: only asserts that src0 is contiguous.
+static void ggml_qnn_sum_rows(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src1;
+    (void)dst;
+    GGML_ASSERT(ggml_is_contiguous(src0));
+}
+
+// Not implemented yet: only asserts that src0 is contiguous.
+static void ggml_qnn_argsort(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst) {
+    (void)ctx;
+    (void)src1;
+    (void)dst;
+    GGML_ASSERT(ggml_is_contiguous(src0));
+}
+
+// Explicit no-op: every argument is ignored.
+static void ggml_qnn_nop(ggml_backend_qnn_context* ctx,
+    const ggml_tensor* src0, const ggml_tensor* src1,
+    ggml_tensor* dst) {
+    (void)ctx;
+    (void)dst;
+    (void)src1;
+    (void)src0;
+}
+
+// Returns the dispatch table that maps each ggml_op value to its QNN
+// implementation. A nullptr slot means the op is not handled by this backend.
+//
+// The table is positional: entry i must correspond to the ggml_op whose value is
+// i. The static_asserts below fail the build if the two populated slots no
+// longer line up with the enum (for example after ggml_op is reordered).
+qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() {
+    static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[GGML_OP_COUNT] = {
+        nullptr, // GGML_OP_NONE
+        nullptr, // GGML_OP_DUP
+        ggml_qnn_add, // GGML_OP_ADD
+        nullptr, // GGML_OP_ADD1
+        nullptr, // GGML_OP_ACC
+        nullptr, // GGML_OP_SUB
+        nullptr, // GGML_OP_MUL
+        nullptr, // GGML_OP_DIV
+        nullptr, // GGML_OP_SQR
+        nullptr, // GGML_OP_SQRT
+        nullptr, // GGML_OP_LOG
+        nullptr, // GGML_OP_SUM
+        nullptr, // GGML_OP_SUM_ROWS
+        nullptr, // GGML_OP_MEAN
+        nullptr, // GGML_OP_ARGMAX
+        nullptr, // GGML_OP_REPEAT
+        nullptr, // GGML_OP_REPEAT_BACK
+        nullptr, // GGML_OP_CONCAT
+        nullptr, // GGML_OP_SILU_BACK
+        nullptr, // GGML_OP_NORM
+        nullptr, // GGML_OP_RMS_NORM
+        nullptr, // GGML_OP_RMS_NORM_BACK
+        nullptr, // GGML_OP_GROUP_NORM
+
+        ggml_qnn_mul_mat, // GGML_OP_MUL_MAT
+        nullptr, // GGML_OP_MUL_MAT_ID
+        nullptr, // GGML_OP_OUT_PROD
+
+        nullptr, // GGML_OP_SCALE
+        nullptr, // GGML_OP_SET
+        nullptr, // GGML_OP_CPY
+        nullptr, // GGML_OP_CONT
+        nullptr, // GGML_OP_RESHAPE
+        nullptr, // GGML_OP_VIEW
+        nullptr, // GGML_OP_PERMUTE
+        nullptr, // GGML_OP_TRANSPOSE
+        nullptr, // GGML_OP_GET_ROWS
+        nullptr, // GGML_OP_GET_ROWS_BACK
+        nullptr, // GGML_OP_DIAG
+        nullptr, // GGML_OP_DIAG_MASK_INF
+        nullptr, // GGML_OP_DIAG_MASK_ZERO
+        nullptr, // GGML_OP_SOFT_MAX
+        nullptr, // GGML_OP_SOFT_MAX_BACK
+        nullptr, // GGML_OP_ROPE
+        nullptr, // GGML_OP_ROPE_BACK
+        nullptr, // GGML_OP_CLAMP
+        nullptr, // GGML_OP_CONV_TRANSPOSE_1D
+        nullptr, // GGML_OP_IM2COL
+        nullptr, // GGML_OP_CONV_TRANSPOSE_2D
+        nullptr, // GGML_OP_POOL_1D
+        nullptr, // GGML_OP_POOL_2D
+        nullptr, // GGML_OP_UPSCALE
+        nullptr, // GGML_OP_PAD
+        nullptr, // GGML_OP_ARANGE
+        nullptr, // GGML_OP_TIMESTEP_EMBEDDING
+        nullptr, // GGML_OP_ARGSORT
+        nullptr, // GGML_OP_LEAKY_RELU
+
+        nullptr, // GGML_OP_FLASH_ATTN_EXT
+        nullptr, // GGML_OP_FLASH_ATTN_BACK
+        nullptr, // GGML_OP_SSM_CONV
+        nullptr, // GGML_OP_SSM_SCAN
+        nullptr, // GGML_OP_WIN_PART
+        nullptr, // GGML_OP_WIN_UNPART
+        nullptr, // GGML_OP_GET_REL_POS
+        nullptr, // GGML_OP_ADD_REL_POS
+
+        nullptr, // GGML_OP_UNARY
+
+        nullptr, // GGML_OP_MAP_UNARY
+        nullptr, // GGML_OP_MAP_BINARY
+
+        nullptr, // GGML_OP_MAP_CUSTOM1_F32
+        nullptr, // GGML_OP_MAP_CUSTOM2_F32
+        nullptr, // GGML_OP_MAP_CUSTOM3_F32
+
+        nullptr, // GGML_OP_MAP_CUSTOM1
+        nullptr, // GGML_OP_MAP_CUSTOM2
+        nullptr, // GGML_OP_MAP_CUSTOM3
+
+        nullptr, // GGML_OP_CROSS_ENTROPY_LOSS
+        nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK
+    };
+
+    // Guard against the positional table drifting out of sync with enum ggml_op.
+    static_assert(kQnnOpsTable[GGML_OP_ADD] == ggml_qnn_add,
+        "kQnnOpsTable is misaligned with ggml_op: GGML_OP_ADD slot");
+    static_assert(kQnnOpsTable[GGML_OP_MUL_MAT] == ggml_qnn_mul_mat,
+        "kQnnOpsTable is misaligned with ggml_op: GGML_OP_MUL_MAT slot");
+
+    return kQnnOpsTable;
+}
--- a/ggml-qnn/backend-ops.hpp
+++ b/ggml-qnn/backend-ops.hpp
@ -0,0 +1,17 @@
+#pragma once
+
+#include "ggml.h"
+#include "backend.hpp"
+
+namespace qnn {
+
+    // Signature shared by every QNN op implementation: a backend context, two
+    // source tensors and the destination tensor.
+    using ggml_qnn_op_t = void (*)(ggml_backend_qnn_context* ctx,
+        const ggml_tensor* src0,
+        const ggml_tensor* src1,
+        ggml_tensor* dst);
+
+    // Reference to a read-only array of GGML_OP_COUNT op implementations.
+    using ggml_qnn_op_array_t = const ggml_qnn_op_t(&)[GGML_OP_COUNT];
+
+    // Returns the op dispatch table (declared here, defined in backend-ops.cpp).
+    ggml_qnn_op_array_t ggml_qnn_op_array();
+
+}
--- a/ggml-qnn/backend.hpp
+++ b/ggml-qnn/backend.hpp
@ -1,11 +1,6 @@

 #pragma once

-#include "QnnTypes.h"
-#include "QnnCommon.h"
-#include "QnnContext.h"
-#include "QnnBackend.h"
-
 #include "ggml.h"
 #include "ggml-backend.h"

--- a/ggml-qnn/qnn.hpp
+++ b/ggml-qnn/qnn.hpp
@ -1,21 +1,27 @@
 #pragma once

+#include <math.h>
 #include <mutex>
+#include <string>
+#include <unordered_map>
+#include <map>

 // header file of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK
 // https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct
 #include "QnnTypes.h"
 #include "QnnCommon.h"
+#include "QnnInterface.h"
 #include "QnnContext.h"
 #include "QnnBackend.h"
 #include "QnnGraph.h"
 #include "QnnProperty.h"
 #include "QnnTensor.h"
+#include "System/QnnSystemInterface.h"
 #include "HTP/QnnHtpDevice.h"
 #include "HTP/QnnHtpGraph.h"

+#include "qnn-types.hpp"
 #include "utils.hpp"
-#include "logger.hpp"

 namespace qnn {

@ -864,9 +870,8 @@ namespace qnn {
        const qnn::qcom_socinfo& get_soc_info() { return _soc_info; }

    public:
-        std::map<std::string, std::tuple<Qnn_GraphHandle_t, Qnn_Tensor_t*,
-            Qnn_Tensor_t*, Qnn_Tensor_t*>>
-            _qnn_graph_map;
+        std::map<std::string,
+            std::tuple<Qnn_GraphHandle_t, Qnn_Tensor_t*, Qnn_Tensor_t*, Qnn_Tensor_t*>> _qnn_graph_map;

    private:
        int load_system() {
--- a/ggml-qnn/tensor.hpp
+++ b/ggml-qnn/tensor.hpp
@ -4,6 +4,7 @@
 #include "QnnTensor.h"
 #include "System/QnnSystemInterface.h"

+#include "ggml-qnn.h"
 #include "backend.hpp"
 #include "qnn.hpp"

--- a/ggml-qnn/utils.cpp
+++ b/ggml-qnn/utils.cpp
@ -0,0 +1,126 @@
+
+#include "utils.hpp"
+
+#include "ggml-qnn.h"
+#include "qnn-types.hpp"
+
+namespace qnn {
+
+    // TODO: map more ggml data types to QNN data types
+    // ref: explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684
+    //
+    // Translates a ggml tensor type into the QNN data type used for it.
+    // Types without a mapping yield QNN_DATATYPE_UNDEFINED.
+    Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) {
+        Qnn_DataType_t qnn_type = QNN_DATATYPE_UNDEFINED;
+        switch (ggmltype) {
+        case GGML_TYPE_F16:
+            qnn_type = QNN_DATATYPE_FLOAT_16;
+            break;
+        case GGML_TYPE_F32:
+            qnn_type = QNN_DATATYPE_FLOAT_32;
+            break;
+        case GGML_TYPE_I8:
+            qnn_type = QNN_DATATYPE_INT_8;
+            break;
+        case GGML_TYPE_Q8_0:
+            qnn_type = QNN_DATATYPE_SFIXED_POINT_8;
+            break;
+        case GGML_TYPE_Q4_0:
+            qnn_type = QNN_DATATYPE_SFIXED_POINT_4;
+            break;
+        default:
+            break;
+        }
+        return qnn_type;
+    }
+
+
+    // Counts the dimensions of `tensor` whose extent is neither 0 nor 1.
+    uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor) {
+        uint32_t non_unit_dims = 0;
+        for (int dim = 0; dim < GGML_MAX_DIMS; ++dim) {
+            const auto extent = tensor->ne[dim];
+            const bool is_trivial = (extent == 0) || (extent == 1);
+            if (!is_trivial) {
+                ++non_unit_dims;
+            }
+        }
+        return non_unit_dims;
+    }
+
+
+    // Returns a static display name for a QNN_BACKEND_* value, or "unknown".
+    const char* get_backend_name(int n_backend_type) {
+        const char* name = "unknown";
+        switch (n_backend_type) {
+        case QNN_BACKEND_CPU:
+            name = "QNN-CPU";
+            break;
+        case QNN_BACKEND_GPU:
+            name = "QNN-GPU";
+            break;
+        case QNN_BACKEND_NPU:
+            name = "QNN-NPU";
+            break;
+        case QNN_BACKEND_GGML:
+            // "fake" QNN backend, used to compare performance between the QNN backend and original GGML
+            name = "ggml";
+            break;
+        default:
+            break;
+        }
+        return name;
+    }
+
+    // Returns a static name for a known chipset id, or "unknown".
+    const char* get_chipset_desc(uint32_t chipset_id) {
+        const char* desc = "unknown";
+        switch (chipset_id) {
+        case SM8450:
+            desc = "SM8450";
+            break;
+        case SM8475:
+            desc = "SM8475";
+            break;
+        case SM8550:
+            desc = "SM8550";
+            break;
+        case SM8650:
+            desc = "SM8650";
+            break;
+        default:
+            break;
+        }
+        return desc;
+    }
+
+    // Returns a static name for a known HTP architecture value, or "unknown".
+    const char* get_htparch_desc(size_t htp_arch) {
+        const char* desc = "unknown";
+        switch (htp_arch) {
+        case V68:
+            desc = "QCOM_HTP_V68";
+            break;
+        case V69:
+            desc = "QCOM_HTP_V69";
+            break;
+        case V73:
+            desc = "QCOM_HTP_V73";
+            break;
+        case V75:
+            desc = "QCOM_HTP_V75";
+            break;
+        default:
+            break;
+        }
+        return desc;
+    }
+
+    // Rounds `offset` up to the nearest multiple of `alignment`.
+    //
+    // An `alignment` of 0 returns `offset` unchanged (the previous code divided
+    // by zero). All arithmetic is done in intptr_t; the previous mixed
+    // signed/unsigned modulo gave wrong results for negative offsets.
+    intptr_t align_to(size_t alignment, intptr_t offset) {
+        if (alignment == 0) {
+            return offset;
+        }
+
+        const intptr_t step = static_cast<intptr_t>(alignment);
+        intptr_t remainder = offset % step;
+        if (remainder < 0) {
+            remainder += step; // % keeps the sign of the dividend; normalise to [0, step)
+        }
+        return remainder == 0 ? offset : offset + (step - remainder);
+    }
+
+    // Returns the byte size of `tensor` as reported by ggml_nbytes().
+    // NOTE(review): the result is narrowed to uint32_t - confirm that tensors
+    // handled here never exceed 4 GiB.
+    uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor) {
+        // Alternative computation, left disabled:
+        //   size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]);
+        //   size_t n_dims = qnn_get_ggml_tensor_rank(tensor);
+        //   for (int i = 1; i < n_dims; i++) {
+        //       data_size *= tensor->ne[i];
+        //   }
+        //   return data_size;
+        const auto byte_count = ggml_nbytes(tensor);
+        return static_cast<uint32_t>(byte_count);
+    }
+
+    // =================================================================================================
+    //
+    //  QNN backend internal helper functions
+    //
+    // =================================================================================================
+    // TODO: only GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MUL_MAT are supported
+    //
+    // Maps a ggml op to the QNN op-type name; returns nullptr for any other op.
+    const char* opname_from_ggmlop(enum ggml_op ggmlop) {
+        const char* qnn_op_name = nullptr;
+        switch (ggmlop) {
+        case GGML_OP_ADD:
+            qnn_op_name = QNN_OP_ELEMENT_WISE_ADD;
+            break;
+        case GGML_OP_MUL:
+            qnn_op_name = QNN_OP_ELEMENT_WISE_MULTIPLY;
+            break;
+        case GGML_OP_MUL_MAT:
+            qnn_op_name = QNN_OP_MAT_MUL;
+            break;
+        default:
+            break;
+        }
+        return qnn_op_name;
+    }
+
+}
--- a/ggml-qnn/utils.hpp
+++ b/ggml-qnn/utils.hpp
@ -1,135 +1,34 @@
 #pragma once

+#include <stdint.h>
+#include <stddef.h>
+#include <inttypes.h>
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <string>
+
 #include "QnnTypes.h"

 #include "ggml.h"

-#include "qnn-types.hpp"
+#include "logger.hpp"

 namespace qnn {

-    // TODO: mapping more ggml data type to QNN data type
-    // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684
-    Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) {
-        switch (ggmltype) {
-        case GGML_TYPE_F16:
-            return QNN_DATATYPE_FLOAT_16;
-        case GGML_TYPE_F32:
-            return QNN_DATATYPE_FLOAT_32;
-        case GGML_TYPE_I8:
-            return QNN_DATATYPE_INT_8;
-        case GGML_TYPE_Q8_0:
-            return QNN_DATATYPE_SFIXED_POINT_8;
-        case GGML_TYPE_Q4_0:
-            return QNN_DATATYPE_SFIXED_POINT_4;
-        default:
-            break;
-        }
-        return QNN_DATATYPE_UNDEFINED;
-    }
+    // Maps a ggml tensor type to a QNN data type; returns
+    // QNN_DATATYPE_UNDEFINED for ggml types that have no mapping.
+    Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype);
+    // Counts the dimensions of `tensor` whose extent (ne[i]) is neither 0 nor 1.
+    uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor);
+    // Printable name for a QNN_BACKEND_* id; "unknown" if the id is not recognized.
+    const char* get_backend_name(int n_backend_type);
+    // Printable name for a known SoC id (e.g. "SM8550"); "unknown" otherwise.
+    const char* get_chipset_desc(uint32_t chipset_id);
+    // Printable name for an HTP architecture id (e.g. "QCOM_HTP_V75"); "unknown" otherwise.
+    const char* get_htparch_desc(size_t htp_arch);
+    // Rounds `offset` up to a multiple of `alignment`; an already aligned
+    // offset is returned unchanged.
+    intptr_t align_to(size_t alignment, intptr_t offset);
+    // Size of the tensor's data in bytes as reported by ggml_nbytes(),
+    // narrowed to 32 bits.
+    uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor);

-
-    uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor) {
-        uint32_t rank = 0;
-        for (int i = 0; i < GGML_MAX_DIMS; i++) {
-            if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) {
-                rank++;
-            }
-        }
-        return rank;
-    }
-
-
-    const char* get_backend_name(int n_backend_type) {
-        switch (n_backend_type) {
-        case QNN_BACKEND_CPU:
-            return "QNN-CPU";
-        case QNN_BACKEND_GPU:
-            return "QNN-GPU";
-        case QNN_BACKEND_NPU:
-            return "QNN-NPU";
-        case QNN_BACKEND_GGML:
-            return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML
-        default:
-            return "unknown";
-        }
-    }
-
-    const char* get_chipset_desc(uint32_t chipset_id) {
-        switch (chipset_id) {
-        case SM8450:
-            return "SM8450";
-        case SM8475:
-            return "SM8475";
-        case SM8550:
-            return "SM8550";
-        case SM8650:
-            return "SM8650";
-        default:
-            return "unknown";
-        }
-    }
-
-    const char* get_htparch_desc(size_t htp_arch) {
-        switch (htp_arch) {
-        case V68:
-            return "QCOM_HTP_V68";
-        case V69:
-            return "QCOM_HTP_V69";
-        case V73:
-            return "QCOM_HTP_V73";
-        case V75:
-            return "QCOM_HTP_V75";
-        default:
-            return "unknown";
-        }
-    }
+    // QNN operator name for a ggml op. Only GGML_OP_ADD, GGML_OP_MUL and
+    // GGML_OP_MUL_MAT are mapped; every other op yields nullptr.
+    const char* opname_from_ggmlop(enum ggml_op ggmlop);

    // Resolves `function_name` in the library identified by `handle` using
    // dlsym() and casts the resulting address to the function-pointer type Fn.
    // The dlsym() result is not checked here: the caller receives a null
    // pointer when the symbol cannot be found and must test for it.
    template <typename Fn> Fn load_qnn_functionpointers(void* handle, const char* function_name) {
        return reinterpret_cast<Fn>(dlsym(handle, function_name));
    }

-    intptr_t align_to(size_t alignment, intptr_t offset) {
-        return offset % alignment == 0
-            ? offset
-            : offset + (static_cast<intptr_t>(alignment) -
-                offset % static_cast<intptr_t>(alignment));
-    }
-
-    uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor) {
-        /*
-        size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]);
-        size_t n_dims = qnn_get_ggml_tensor_rank(tensor);
-        for (int i = 1; i < n_dims; i++) {
-            data_size *= tensor->ne[i];
-        }
-
-        return data_size;
-        */
-        return ggml_nbytes(tensor);
-    }
-
-
-    // =================================================================================================
-    //
-    //  QNN backend internal helper functions
-    //
-    // =================================================================================================
-    // TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT
-    const char* opname_from_ggmlop(enum ggml_op ggmlop) {
-        switch (ggmlop) {
-        case GGML_OP_ADD:
-            return QNN_OP_ELEMENT_WISE_ADD;
-        case GGML_OP_MUL:
-            return QNN_OP_ELEMENT_WISE_MULTIPLY;
-        case GGML_OP_MUL_MAT:
-            return QNN_OP_MAT_MUL;
-        default:
-            break;
-        }
-        return nullptr;
-    }
-
    inline int validate_tensor_version(Qnn_Tensor_t tensor) {
        if (tensor.version != QNN_TENSOR_VERSION_1) {
            QNN_LOG_WARN(
@ -272,6 +171,45 @@ namespace qnn {
            tensor.v1.memHandle = handle;
        }
    }
+
+
+#if ENABLE_QNNBACKEND_PERF
+    // Small stopwatch for logging how long a labelled piece of work takes.
+    //
+    // Usage: construct with a label, call start() before the work and info()
+    // after it. info() logs the time elapsed between the two calls, measured
+    // with ggml_time_us(). Instances cannot be default-constructed or copied.
+    class qnn_perf {
+    public:
+        // `perf_name` is the label printed by info(). It is copied: the
+        // argument is a const reference and therefore cannot be moved from.
+        qnn_perf(const std::string& perf_name) : _perf_name(perf_name) {}
+        qnn_perf() = delete;
+        qnn_perf(const qnn_perf&) = delete;
+        qnn_perf& operator= (const qnn_perf&) = delete;
+
+        // Records the starting timestamp.
+        void start() {
+            _begin_time = ggml_time_us();
+        }
+
+        // Records the end timestamp and logs the duration since start().
+        void info() {
+            _end_time = ggml_time_us();
+            _duration = (_end_time - _begin_time);
+            // int64_t is `long` on LP64 targets, so convert explicitly to the
+            // `long long` that the %lld conversion expects.
+            QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(),
+                static_cast<long long>(_duration));
+        }
+
+    private:
+        int64_t _begin_time = 0LL;
+        int64_t _end_time = 0LL;
+        int64_t _duration = 0LL;
+        std::string _perf_name;
+    };
+#else
+    // No-op variant used when ENABLE_QNNBACKEND_PERF is disabled: it exposes
+    // the same interface so call sites compile unchanged, but stores nothing
+    // and logs nothing.
+    class qnn_perf {
+    public:
+        // The label is ignored; the parameter is unnamed to avoid an
+        // unused-parameter warning.
+        qnn_perf(const std::string&) {}
+        qnn_perf() = delete;
+        qnn_perf(const qnn_perf&) = delete;
+        qnn_perf& operator= (const qnn_perf&) = delete;
+
+        void start() {}
+        void info() {}
+    };
+#endif
+
 }


--- a/tests/ggml-qnn/CMakeLists.txt
+++ b/tests/ggml-qnn/CMakeLists.txt
@ -21,6 +21,8 @@ set(SOURCE_FILES
        ../../ggml-backend.c
        ../../ggml-quants.c
        ../../ggml-qnn/logger.cpp
+        ../../ggml-qnn/utils.cpp
+        ../../ggml-qnn/backend-ops.cpp
        ../../ggml-qnn.cpp
        ggml-qnn-ut.cpp
 )