review: make an MVP (Minimum Viable PR) style PR in upstream
This commit is contained in:
parent faaa86b7e4
commit 5598fbd15d
3 changed files with 185 additions and 443 deletions
593 ggml-qnn.cpp
@@ -55,7 +55,7 @@
 #include "Saver/QnnSaver.h"
 #include "System/QnnSystemInterface.h"
 #include "HTP/QnnHtpDevice.h"
-#include <HTP/QnnHtpGraph.h>
+#include "HTP/QnnHtpGraph.h"

 // =================================================================================================
 //
@@ -91,12 +91,6 @@ typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx,
                                 const ggml_tensor * src1,
                                 ggml_tensor * dst);

-typedef void (*ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx,
-                                       const ggml_op ggml_op,
-                                       const ggml_tensor * src0,
-                                       const ggml_tensor * src1,
-                                       ggml_tensor * dst);
-
 enum qcom_htp_arch {
     NONE = 0,
     V68 = 68,
@@ -424,6 +418,7 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor
     return true;
 }

+#ifndef NDEBUG
 #define CHECK_PARAMS(ctx, src0, src1, dst)                          \
     do {                                                            \
         if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) {  \
@@ -431,6 +426,10 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor
         }                                                           \
     } while (0)

+#else
+#define CHECK_PARAMS(ctx, src0, src1, dst)
+#endif
+
 #if ENABLE_QNNBACKEND_PERF
 class qnn_perf {
 public:
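Review note: guarding CHECK_PARAMS with #ifndef NDEBUG makes the per-op argument validation a debug-only cost; release builds expand the macro to nothing. A minimal self-contained sketch of the same pattern (the names here are illustrative, not taken from the patch):

```cpp
#include <cstdio>

// Debug-only validation: a real check in debug builds, a no-op when
// NDEBUG is defined (i.e. in release builds).
#ifndef NDEBUG
#define VALIDATE_OR_RETURN(cond, msg)            \
    do {                                         \
        if (!(cond)) {                           \
            std::fprintf(stderr, "%s\n", (msg)); \
            return;                              \
        }                                        \
    } while (0)
#else
#define VALIDATE_OR_RETURN(cond, msg)
#endif

static void example_op(const float * src, float * dst) {
    VALIDATE_OR_RETURN(src != nullptr && dst != nullptr, "null tensor");
    dst[0] = src[0]; // the real op body would go here
}
```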
@@ -446,7 +445,7 @@ public:
     void info() {
         _end_time = ggml_time_us();
         _duration = (_end_time - _begin_time);
-        QNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration);
+        QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration);
     }

 private:
@@ -809,7 +808,7 @@ static void qnn_sdk_logcallback(const char * fmt, QnnLog_Level_t level,

         memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN);
         vsnprintf(reinterpret_cast<char *const>(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp);
-        QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf);
+        QNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf);
     }
 #endif
 }
@@ -1069,7 +1068,7 @@ class qnn_instance {
             arch_devconfig.option       = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
             arch_devconfig.customConfig = &arch_customconfig;

-            const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, NULL};
+            const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr};
             qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle);
         } else {
             qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle);
@@ -1137,10 +1136,14 @@ class qnn_instance {
             _pfn_rpc_mem_init();
         }

-        std::vector<const QnnContext_Config_t *> temp_context_config;
+        /* TODO: not used, keep it for further usage
+        QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT;
+        qnn_context_config.priority = QNN_PRIORITY_DEFAULT;
+        const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr};
+        */
         _qnn_interface.qnn_context_create(
             _qnn_backend_handle, _qnn_device_handle,
-            temp_context_config.empty() ? nullptr : temp_context_config.data(),
+            nullptr,
             &_qnn_context_handle);
         if (nullptr == _qnn_context_handle) {
             QNN_LOG_WARN("why failed to initialize qnn context\n");
@@ -1157,9 +1160,11 @@ class qnn_instance {
         size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048};
         size_t probe_counts  = sizeof(probe_slots) / sizeof(size_t);
         for (size_t idx = 0; idx < probe_counts; idx++) {
-            rpc_buffer = static_cast<uint8_t *>(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4));
+            rpc_buffer = static_cast<uint8_t *>(alloc_rpcmem(
+                    probe_slots[idx] * size_in_mb, 4));
             if (nullptr == rpc_buffer) {
-                QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno));
+                QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n",
+                             probe_slots[idx], strerror(errno));
                 break;
             } else {
                 candidate_size = probe_slots[idx];
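Review note on the probe loop above: it sizes the usable rpcmem pool by attempting progressively larger allocations and remembering the largest slot that succeeds. A hedged sketch of the same strategy, with plain malloc/free standing in for the rpcmem allocator:

```cpp
#include <cstdlib>
#include <cstddef>

// Probe candidate capacities (in MB) from small to large; keep the largest
// size that can actually be allocated, releasing each probe buffer again.
static size_t probe_capacity_mb() {
    const size_t size_in_mb    = 1024 * 1024;
    const size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048};
    size_t candidate_mb = 0;
    for (size_t slot : probe_slots) {
        void * buf = std::malloc(slot * size_in_mb);
        if (buf == nullptr) {
            break;            // first failure ends the probe
        }
        candidate_mb = slot;  // largest successful size so far
        std::free(buf);
    }
    return candidate_mb;
}
```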
@@ -1262,8 +1267,8 @@ class qnn_instance {
         return ret_status;
     }

-    //keep it for further usage of offload the entire cgraph to a single QNN DAG directly
-    //which was used in Qualcomm's dedicated AI technology
+    //TODO:keep it for further usage of offload the entire cgraph to a single QNN DAG directly
+    // which was used in Qualcomm's dedicated AI technology
 #if 0
     int init_qnn_graph(const char * graph_name, bool debug,
                        uint8_t do_node_validation = true,
@@ -1430,13 +1435,14 @@ class qnn_instance {
         QnnHtpPerfInfrastructure_PowerConfig_t power_config;
         memset(&power_config, 0, sizeof(power_config));
         power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3;
-        power_config.dcvsV3Config.dcvsEnable    = 0;
         power_config.dcvsV3Config.setDcvsEnable = 1;
+        power_config.dcvsV3Config.dcvsEnable    = 0;
         power_config.dcvsV3Config.contextId     = _qnn_power_configid;
         power_config.dcvsV3Config.powerMode     = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE;
         power_config.dcvsV3Config.setSleepLatency =
             1; // true to consider Latency parameter otherwise false
-        power_config.dcvsV3Config.sleepLatency = 10;
+        power_config.dcvsV3Config.sleepLatency = 40;
         power_config.dcvsV3Config.setBusParams =
             1; // true to consider Bus parameter otherwise false
         power_config.dcvsV3Config.setCoreParams =
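Review note: power_configs below follows the convention used throughout the QNN API, an array of pointers to config structs terminated by a null entry rather than an explicit count. A small self-contained illustration of the idiom with a hypothetical config type:

```cpp
#include <cstddef>

struct Config {
    int          option;
    const void * payload;
};

// QNN-style consumer: walks a null-terminated array of config pointers.
static int count_configs(const Config * const * configs) {
    int n = 0;
    if (configs != nullptr) {
        while (configs[n] != nullptr) {
            ++n;
        }
    }
    return n;
}

int main() {
    Config a{1, nullptr};
    Config b{2, nullptr};
    const Config * list[] = {&a, &b, nullptr}; // the terminator is mandatory
    return count_configs(list) == 2 ? 0 : 1;
}
```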
@@ -1459,6 +1465,7 @@ class qnn_instance {
             DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
         power_config.dcvsV3Config.coreVoltageCornerMax =
             DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
+
         // set power config with different performance parameters
         const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {
             &power_config, nullptr};
@@ -1550,6 +1557,7 @@ class qnn_instance {
             QNN_LOG_WARN("rpc memory already allocated\n");
             return 3;
         }
+
         if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) {
             QNN_LOG_WARN("tensor %s has been registered shared memory\n",
                          (QNN_VER_PTR(*p_tensor)->name));
@@ -1710,7 +1718,7 @@ class qnn_instance {
         int result = 0;

         if (nullptr == _system_lib_handle) {
-            QNN_LOG_DEBUG("system lib handle is null\n");
+            QNN_LOG_WARN("system lib handle is null\n");
             return 1;
         }

@@ -1724,8 +1732,7 @@ class qnn_instance {

         int dlclose_error = dlclose(_system_lib_handle);
         if (dlclose_error != 0) {
-            QNN_LOG_WARN("failed to close QnnSystem library, error %s\n",
-                         dlerror());
+            QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror());
             return 2;
         }

@@ -1740,8 +1747,7 @@ class qnn_instance {

         void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL);
         if (nullptr == lib_handle) {
-            QNN_LOG_WARN("can not open QNN library %s, with error: %s",
-                         lib_path.c_str(), dlerror());
+            QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror());
             return 1;
         }

@@ -1749,8 +1755,7 @@ class qnn_instance {
         load_qnn_functionpointers<pfn_qnninterface_getproviders *>(
             lib_handle, "QnnInterface_getProviders");
         if (nullptr == get_providers) {
-            QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s",
-                         dlerror());
+            QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror());
             return 2;
         }

@@ -1758,14 +1763,12 @@ class qnn_instance {
         const QnnInterface_t ** provider_list = nullptr;
         error = get_providers(&provider_list, &num_providers);
         if (error != QNN_SUCCESS) {
-            QNN_LOG_WARN("failed to get providers, error %d",
-                         QNN_GET_ERROR_CODE(error));
+            QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error));
             return 3;
         }
         QNN_LOG_DEBUG("num_providers=%d\n", num_providers);
         if (num_providers != _required_num_providers) {
-            QNN_LOG_WARN("providers is %d instead of required %d", num_providers,
-                         _required_num_providers);
+            QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers);
             return 4;
         }

@@ -1797,16 +1800,14 @@ class qnn_instance {
         BackendIdType backend_id          = provider_list[0]->backendId;
         _lib_path_to_backend_id[lib_path] = backend_id;
         if (_loaded_backend.count(backend_id) > 0) {
-            QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n",
-                         lib_path.c_str(), backend_id);
+            QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id);
         }
         _loaded_backend[backend_id] = provider_list[0];
         if (_loaded_lib_handle.count(backend_id) > 0) {
             QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]);
             int dlclose_error = dlclose(_loaded_lib_handle[backend_id]);
             if (dlclose_error != 0) {
-                QNN_LOG_WARN("fail to close %p with error %s\n",
-                             _loaded_lib_handle[backend_id], dlerror());
+                QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror());
             }
         }
         _loaded_lib_handle[backend_id] = lib_handle;
@@ -1820,8 +1821,7 @@ class qnn_instance {
         for (auto & it : _loaded_lib_handle) {
             dlclose_error = dlclose(it.second);
             if (dlclose_error != 0) {
-                QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first,
-                             dlerror());
+                QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror());
             }
         }

@@ -1924,7 +1924,6 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
     const int64_t ne01 = src0->ne[1];
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
-
     // make qnn_get_ggml_tensor_rank and QNN SDK happy
     if (ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1) {
         return false;
@@ -1932,13 +1931,13 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,

     // TODO: support other GGML OPs using QNN API
     // a GENERAL approach could fix this problem in a standalone PR of refine ggml backend
-    // subsystem for mixed inference between CPU&GPU / CPU&NPU easily for ANY ggml backends
-    // which the backend's ggml_backend_xxx_buffer_is_host return true.
-    // this approach could be found:
+    // subsystem for hybrid inference between CPU&GPU / CPU&NPU easily(less the 100 LoC and no
+    // side-effect to the existing codes) for ANY ggml backends which the backend's
+    // ggml_backend_xxx_buffer_is_host return true. this approach could be found at:
     // https://github.com/ggerganov/llama.cpp/pull/7641
     bool supported_op = false;
     supported_op = (tensor->op == GGML_OP_ADD);
-    supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT));
+    supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT));
     if (!supported_op) {
         return false;
     }
@@ -1950,14 +1949,9 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
         }
     }

-    int qtype = src0->type;
-    if (tensor->op == GGML_OP_MUL) {
-        return (qtype == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32);
-    }
-
     if (tensor->op == GGML_OP_MUL_MAT) {
         if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) {
-            //make mul_mat with QNN RPC happy
+            //comment it for make UT of mul_mat with QNN RPC happy
             //return false;
         }
     }
@@ -1965,6 +1959,8 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
     return true;
 }

+//TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat
+// keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC
 static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
                          const ggml_tensor * src1, ggml_tensor * dst) {
     Qnn_ErrorHandle_t error = QNN_SUCCESS;
@@ -1986,10 +1982,11 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
     tensor_1 = (Qnn_Tensor_t *) src1->extra;
     tensor_2 = (Qnn_Tensor_t *) dst->extra;
     instance = ctx->instance;
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;

     qnn_perf perf("ggml_qnn_add");
     perf.start();

-    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
     QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
@@ -2034,17 +2031,31 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src

             QnnHtpGraph_CustomConfig_t dlbc_config;
             dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
-            /*
             dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
-            dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC
-            */
-            dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+            dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC

             QnnGraph_Config_t graph_dlbc_config;
             graph_dlbc_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
             graph_dlbc_config.customConfig = &dlbc_config;

-            const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL};
+            QnnHtpGraph_CustomConfig_t opt_config;
+            opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+            opt_config.optimizationOption.floatValue = 1; // 1 / 3
+            QnnGraph_Config_t graph_opt_config;
+            graph_opt_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_opt_config.customConfig = &opt_config;
+
+            QnnHtpGraph_CustomConfig_t vtcm_config;
+            vtcm_config.option       = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
+            vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb;
+            QnnGraph_Config_t graph_vtcm_config;
+            graph_vtcm_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_vtcm_config.customConfig = &vtcm_config;
+
+            const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config,
+                                                         &graph_dlbc_config,
+                                                         &graph_vtcm_config,
+                                                         &graph_opt_config,
+                                                         NULL};
             error = qnn_raw_interface.graphCreate(
                 instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig,
                 &graph_handle);
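Review note: the NPU graph setup now chains four custom configs (HVX threads, DLBC, VTCM size, finalize-optimization) into one null-terminated array for graphCreate. If the list keeps growing it could be factored into a helper along these lines; a hypothetical refactor sketch, not part of the patch (QnnGraph_Config_t comes from the QNN SDK headers this file already includes):

```cpp
#include <vector>

// Builds the null-terminated pointer list graphCreate() expects from a set
// of already-filled QnnGraph_Config_t wrappers. The storage vector must
// outlive the graphCreate() call, since the list points into it.
static std::vector<const QnnGraph_Config_t *> make_graph_config_list(
        const std::vector<QnnGraph_Config_t> & storage) {
    std::vector<const QnnGraph_Config_t *> list;
    list.reserve(storage.size() + 1);
    for (const QnnGraph_Config_t & cfg : storage) {
        list.push_back(&cfg);
    }
    list.push_back(nullptr); // QNN expects a null terminator
    return list;
}
```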
@@ -2113,27 +2124,33 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
             uint8_t * qnn_buffer_2  = nullptr;
             qnn_instance * instance = ctx->instance;

-            qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
+            qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                    ggml_nbytes(src0), 4));
             if (nullptr == qnn_buffer_0) {
                 QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+                goto failure;
             } else {
                 QNN_LOG_INFO("alloc rpcmem successfully\n");
             }
             instance->register_rpcmem(qnn_buffer_0, tensor_0);
             memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));

-            qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src1), 4));
+            qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                    ggml_nbytes(src1), 4));
             if (nullptr == qnn_buffer_1) {
                 QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+                goto failure;
             } else {
                 QNN_LOG_INFO("alloc rpcmem successfully\n");
             }
             instance->register_rpcmem(qnn_buffer_1, tensor_1);
             memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));

-            qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(dst), 4));
+            qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                    ggml_nbytes(dst), 4));
             if (nullptr == qnn_buffer_2) {
                 QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+                goto failure;
             } else {
                 QNN_LOG_INFO("alloc rpcmem successfully\n");
             }
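Review note: each NPU tensor above goes through the same staging sequence: allocate shared rpc memory, register it so the tensor's memHandle refers to it, then memcpy the ggml data in (the result is copied back out after graphExecute). A condensed sketch of that order of operations, reusing the qnn_instance helpers this file already defines:

```cpp
#include <cstring>

// Stage one ggml input tensor into QNN shared memory for the NPU path.
// alloc_rpcmem/register_rpcmem are the existing qnn_instance helpers.
static bool stage_npu_input(qnn_instance * instance, Qnn_Tensor_t * tensor,
                            const void * data, size_t nbytes) {
    uint8_t * buf = static_cast<uint8_t *>(instance->alloc_rpcmem(nbytes, 4));
    if (nullptr == buf) {
        return false; // caller bails out (the patch now does goto failure)
    }
    instance->register_rpcmem(buf, tensor); // binds buf to tensor's memHandle
    memcpy(buf, data, nbytes);              // copy ggml data into shared mem
    return true;
}
```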
@@ -2144,23 +2161,33 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
         Qnn_OpConfig_t op_config = {
             (Qnn_OpConfigVersion_t) 1,
-            .v1 = {"ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW,
-                   QNN_OP_ELEMENT_WISE_ADD, 0, qnn_params,
-                   2, tensor_inputs, 1,
-                   tensor_outputs}};
+            .v1 = {"ggml_op_add",
+                   QNN_OP_PACKAGE_NAME_QTI_AISW,
+                   QNN_OP_ELEMENT_WISE_ADD,
+                   0, qnn_params,
+                   2, tensor_inputs,
+                   1,tensor_outputs}
+        };
         error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
-        error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
+        error = qnn_raw_interface.graphFinalize(graph_handle,
+                                                nullptr, nullptr);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
-        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2,
+        error = qnn_raw_interface.graphExecute(graph_handle,
+                                               tensor_inputs, 2,
                                                tensor_outputs, 1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
@@ -2221,9 +2248,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src

         Qnn_Tensor_t tensor_inputs[]  = {*tensor_0, *tensor_1};
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2,
+        error = qnn_raw_interface.graphExecute(graph_handle,
+                                               tensor_inputs,2,
                                                tensor_outputs,1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
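Review note: the added check distinguishes an NPU sub-system restart (SSR) from an ordinary graph failure by testing for QNN_COMMON_ERROR_SYSTEM_COMMUNICATION after graphExecute. Since the same block now appears after every execute call, it could be folded into one helper; a hypothetical sketch:

```cpp
// Hypothetical helper wrapping the SSR check the patch repeats after each
// graphExecute call; returns true when the NPU appears to have crashed.
static bool detect_npu_ssr(int device, Qnn_ErrorHandle_t error) {
    if (device == QNN_BACKEND_NPU &&
        QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
        QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
        return true;
    }
    return false;
}
```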
@@ -2299,6 +2332,8 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
     tensor_1 = (Qnn_Tensor_t *) src1->extra;
     tensor_2 = (Qnn_Tensor_t *) dst->extra;
     instance = ctx->instance;
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
+
     qnn_perf perf("ggml_qnn_mul_mat");
     perf.start();

@@ -2307,7 +2342,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
     tensor_2 = (Qnn_Tensor_t *) dst->extra;
     instance = ctx->instance;

-    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
     QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
@@ -2338,6 +2372,11 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
     uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions;
     uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions;

+    //TODO: for scenarios of quantized data in src0
+    //      pass-1: dequantize src0 to FP32
+    //      pass-2: dq-src0 * src1
+    //      the performance gains is worth although there is performance loss in pass-1
+
     if (!graph_initialized) {
         graph_name = graph_name + "_" + std::to_string(ctx->threads) +
                      "_" + src0->name + "_" + src1->name;
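Review note: the new TODO sketches a two-pass plan for quantized src0: dequantize to FP32 first, then hand a plain FP32 MAT_MUL to QNN. A hedged sketch of what pass-1 could look like using ggml's type traits (assuming the ggml_internal_get_type_traits API available in this tree; treat it as illustrative only):

```cpp
#include "ggml.h"

// Pass-1 sketch: dequantize a quantized 2D src0 into an FP32 buffer so that
// pass-2 can run an FP32 MAT_MUL on the NPU. wdata must hold ne[0]*ne[1]
// floats; row strides come from src0->nb as elsewhere in ggml.
static void dequantize_src0_to_f32(const ggml_tensor * src0, float * wdata) {
    const ggml_type_traits_t traits = ggml_internal_get_type_traits(src0->type);
    const int64_t ncols = src0->ne[0];
    const int64_t nrows = src0->ne[1];
    for (int64_t r = 0; r < nrows; ++r) {
        const char * row = (const char *) src0->data + r * src0->nb[1];
        traits.to_float(row, wdata + r * ncols, ncols);
    }
}
```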
@@ -2352,17 +2391,31 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,

             QnnHtpGraph_CustomConfig_t dlbc_config;
             dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
-            /*
             dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
-            dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC
-            */
-            dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+            dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC

             QnnGraph_Config_t graph_dlbc_config;
             graph_dlbc_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
             graph_dlbc_config.customConfig = &dlbc_config;

-            const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL};
+            QnnHtpGraph_CustomConfig_t opt_config;
+            opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+            opt_config.optimizationOption.floatValue = 1; //1 / 3
+            QnnGraph_Config_t graph_opt_config;
+            graph_opt_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_opt_config.customConfig = &opt_config;
+
+            QnnHtpGraph_CustomConfig_t vtcm_config;
+            vtcm_config.option       = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
+            vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb;
+            QnnGraph_Config_t graph_vtcm_config;
+            graph_vtcm_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_vtcm_config.customConfig = &vtcm_config;
+
+            const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config,
+                                                         &graph_dlbc_config,
+                                                         &graph_vtcm_config,
+                                                         &graph_opt_config,
+                                                         NULL};
             error = qnn_raw_interface.graphCreate(
                 instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig,
                 &graph_handle);
@@ -2428,27 +2481,33 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
             uint8_t * qnn_buffer_2  = nullptr;
             qnn_instance * instance = ctx->instance;

-            qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
+            qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                    ggml_nbytes(src0), 4));
             if (nullptr == qnn_buffer_0) {
                 QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+                goto failure;
             } else {
                 QNN_LOG_INFO("alloc rpcmem successfully\n");
             }
             instance->register_rpcmem(qnn_buffer_0, tensor_0);
             memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));

-            qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src1), 4));
+            qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                    ggml_nbytes(src1), 4));
             if (nullptr == qnn_buffer_1) {
                 QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+                goto failure;
             } else {
                 QNN_LOG_INFO("alloc rpcmem successfully\n");
             }
             instance->register_rpcmem(qnn_buffer_1, tensor_1);
             memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));

-            qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(dst), 4));
+            qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                    ggml_nbytes(dst), 4));
             if (nullptr == qnn_buffer_2) {
                 QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+                goto failure;
             } else {
                 QNN_LOG_INFO("alloc rpcmem successfully\n");
             }
@@ -2457,17 +2516,22 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,

         Qnn_Tensor_t tensor_inputs[]  = {*tensor_0, *tensor_1};
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1,
+        Qnn_OpConfig_t op_config = {
+            (Qnn_OpConfigVersion_t) 1,
             .v1 = {"ggml_op_mul_mat",
                    QNN_OP_PACKAGE_NAME_QTI_AISW,
-                   QNN_OP_MAT_MUL, 0, qnn_params, 2,
-                   tensor_inputs, 1, tensor_outputs}};
+                   QNN_OP_MAT_MUL,
+                   0, qnn_params,
+                   2, tensor_inputs,
+                   1, tensor_outputs}
+        };
         error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
-        error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
+        error = qnn_raw_interface.graphFinalize(graph_handle,
+                                                nullptr, nullptr);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
@@ -2476,6 +2540,11 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
                                                tensor_inputs, 2,
                                                tensor_outputs, 1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
@@ -2540,6 +2609,11 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
                                                tensor_inputs, 2,
                                                tensor_outputs, 1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
@@ -2580,299 +2654,6 @@ failure:
     perf.info();
 }

-// common function for GGML OPs using QNN API
-static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx,
-                               const enum ggml_op ggmlop,
-                               const ggml_tensor * src0, const ggml_tensor * src1,
-                               ggml_tensor * dst) {
-    Qnn_ErrorHandle_t error        = QNN_SUCCESS;
-    bool graph_initialized         = false;
-    qnn_instance * instance        = nullptr;
-    std::string qnn_graph_name     = "ggml_qnn_graph";
-    std::string qnn_op_config_name = "ggml_qnn_op_config";
-    const char * qnn_op_name       = nullptr;
-    Qnn_GraphHandle_t graph_handle = nullptr;
-    Qnn_Tensor_t * tensor_0        = nullptr;
-    Qnn_Tensor_t * tensor_1        = nullptr;
-    Qnn_Tensor_t * tensor_2        = nullptr;
-    Qnn_Param_t qnn_params[]       = {};
-    Qnn_DataType_t src0_qnn_type   = QNN_DATATYPE_FLOAT_32;
-    Qnn_DataType_t src1_qnn_type   = QNN_DATATYPE_FLOAT_32;
-    Qnn_DataType_t dst_qnn_type    = QNN_DATATYPE_FLOAT_32;
-
-    CHECK_PARAMS(ctx, src0, src1, dst);
-    tensor_0 = (Qnn_Tensor_t *) src0->extra;
-    tensor_1 = (Qnn_Tensor_t *) src1->extra;
-    tensor_2 = (Qnn_Tensor_t *) dst->extra;
-    instance = ctx->instance;
-    qnn_perf perf(ggml_op_name(ggmlop));
-    perf.start();
-
-    qnn_op_name = qnn_opname_from_ggmlop(ggmlop);
-    if (nullptr == qnn_op_name) {
-        QNN_LOG_WARN("ggml op %d(%s) not supported currently", ggmlop, ggml_op_name(ggmlop));
-        return;
-    }
-
-    tensor_0 = (Qnn_Tensor_t *) src0->extra;
-    tensor_1 = (Qnn_Tensor_t *) src1->extra;
-    tensor_2 = (Qnn_Tensor_t *) dst->extra;
-    instance = ctx->instance;
-    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
-
-    src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type);
-    src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type);
-    dst_qnn_type  = qnn_datatype_from_ggml_datatype(dst->type);
-
-    QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
-    QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
-    QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
-
-    uint32_t dimensions_input_0[] = {
-        (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2],
-        (uint32_t) src0->ne[3]};
-    uint32_t dimensions_input_1[] = {
-        (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2],
-        (uint32_t) src1->ne[3]};
-    uint32_t dimensions_output[] = {
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
-        (uint32_t) dst->ne[3]};
-
-    std::string map_entry = std::string(ggml_op_name(ggmlop));
-    if (instance->_qnn_graph_map.find(map_entry) !=
-        instance->_qnn_graph_map.end()) {
-        graph_initialized = true;
-        auto & graph_item = instance->_qnn_graph_map[map_entry];
-        graph_handle      = std::get<0>(graph_item);
-    }
-
-    uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions;
-    uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions;
-    uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions;
-
-    if (!graph_initialized) {
-        qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) +
-                         std::to_string(ctx->threads) + src0->name + "_" +
-                         src1->name;
-        qnn_op_config_name = qnn_op_config_name + "_" + ggml_op_name(ggmlop) +
-                             std::to_string(ctx->threads) + src0->name + "_" +
-                             src1->name;
-        QNN_LOG_DEBUG("qnn graph name %s", qnn_graph_name.c_str());
-        QNN_LOG_DEBUG("qnn op_config name %s", qnn_op_config_name.c_str());
-        error = qnn_raw_interface.graphCreate(
-            instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr,
-            &graph_handle);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph "
-                         "name %s, error = %d\n",
-                         ggml_op_name(ggmlop), qnn_graph_name.c_str(), error);
-            goto failure;
-        }
-
-        if (ctx->device == QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
-            QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0};
-
-            QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
-            QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0};
-
-            QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
-            QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0};
-        }
-
-        error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-
-        QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
-        QNN_VER_PTR(*tensor_0)->rank       = qnn_get_ggml_tensor_rank(src0);
-        QNN_VER_PTR(*tensor_0)->dataType   = src0_qnn_type;
-        QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
-        QNN_VER_PTR(*tensor_1)->rank       = qnn_get_ggml_tensor_rank(src1);
-        QNN_VER_PTR(*tensor_1)->dataType   = src1_qnn_type;
-        QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
-        QNN_VER_PTR(*tensor_2)->rank       = qnn_get_ggml_tensor_rank(dst);
-        QNN_VER_PTR(*tensor_2)->dataType   = dst_qnn_type;
-
-        if (ctx->device != QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
-                                                 qnn_get_ggml_tensor_data_size(src0)};
-            QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
-                                                 qnn_get_ggml_tensor_data_size(src1)};
-            QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
-                                                 qnn_get_ggml_tensor_data_size(dst)};
-        } else {
-            uint8_t * qnn_buffer_0 = nullptr;
-            uint8_t * qnn_buffer_1 = nullptr;
-            uint8_t * qnn_buffer_2 = nullptr;
-            qnn_instance * instance = ctx->instance;
-
-            qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
-            if (nullptr == qnn_buffer_0) {
-                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
-            } else {
-                QNN_LOG_INFO("alloc rpcmem successfully\n");
-            }
-            instance->register_rpcmem(qnn_buffer_0, tensor_0);
-            memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
-
-            qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src1), 4));
-            if (nullptr == qnn_buffer_1) {
-                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
-            } else {
-                QNN_LOG_INFO("alloc rpcmem successfully\n");
-            }
-            instance->register_rpcmem(qnn_buffer_1, tensor_1);
-            memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
-
-            qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(dst), 4));
-            if (nullptr == qnn_buffer_2) {
-                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
-            } else {
-                QNN_LOG_INFO("alloc rpcmem successfully\n");
-            }
-            instance->register_rpcmem(qnn_buffer_2, tensor_2);
-        }
-
-        Qnn_Tensor_t tensor_inputs[]  = {*tensor_0, *tensor_1};
-        Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1,
-                                    .v1 = {qnn_op_config_name.c_str(),
-                                           QNN_OP_PACKAGE_NAME_QTI_AISW,
-                                           qnn_op_name, 0, qnn_params, 2,
-                                           tensor_inputs, 1, tensor_outputs}};
-        error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.graphExecute(graph_handle,
-                                               tensor_inputs, 2,
-                                               tensor_outputs, 1,
-                                               nullptr, nullptr);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-
-        if (ctx->device == QNN_BACKEND_NPU) {
-            uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_2)->memHandle));
-            memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
-        }
-
-        auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2);
-        instance->_qnn_graph_map[map_entry] = graph_item;
-    } else {
-        auto & graph_item = instance->_qnn_graph_map[map_entry];
-        graph_handle = std::get<0>(graph_item);
-        tensor_0     = std::get<1>(graph_item);
-        tensor_1     = std::get<2>(graph_item);
-        tensor_2     = std::get<3>(graph_item);
-
-        uint32_t dimensions_input_0[] = {
-            (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-            (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]};
-        uint32_t dimensions_input_1[] = {
-            (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-            (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]};
-        uint32_t dimensions_output[] = {
-            (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
-            (uint32_t) dst->ne[3]};
-        QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
-        QNN_VER_PTR(*tensor_0)->rank       = qnn_get_ggml_tensor_rank(src0);
-        QNN_VER_PTR(*tensor_0)->dataType   = src0_qnn_type;
-        QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
-        QNN_VER_PTR(*tensor_1)->rank       = qnn_get_ggml_tensor_rank(src1);
-        QNN_VER_PTR(*tensor_1)->dataType   = src1_qnn_type;
-        QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
-        QNN_VER_PTR(*tensor_2)->rank       = qnn_get_ggml_tensor_rank(dst);
-        QNN_VER_PTR(*tensor_2)->dataType   = dst_qnn_type;
-
-        if (ctx->device != QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
-                                                 qnn_get_ggml_tensor_data_size(src0)};
-            QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
-                                                 qnn_get_ggml_tensor_data_size(src1)};
-            QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
-                                                 qnn_get_ggml_tensor_data_size(dst)};
-        } else {
-            uint8_t * qnn_buffer_0 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_0)->memHandle));
-            if (nullptr != qnn_buffer_0)
-                memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
-
-            uint8_t * qnn_buffer_1 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_1)->memHandle));
-            if (nullptr != qnn_buffer_1)
-                memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
-        }
-
-        Qnn_Tensor_t tensor_inputs[]  = {*tensor_0, *tensor_1};
-        Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        error = qnn_raw_interface.graphExecute(graph_handle,
-                                               tensor_inputs, 2,
-                                               tensor_outputs, 1,
-                                               nullptr, nullptr);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-
-        if (ctx->device == QNN_BACKEND_NPU) {
-            uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_2)->memHandle));
-            if (nullptr != qnn_buffer_2)
-                memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
-        }
-    }
-
-failure:
-    if (QNN_SUCCESS != error) {
-        QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0));
-        QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1));
-        QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2));
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                      " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      src0->name, src0->type, ggml_type_name(src0->type),
-                      src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0],
-                      src0->nb[1], src0->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                      " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      src1->name, src1->type, ggml_type_name(src1->type),
-                      src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0],
-                      src1->nb[1], src1->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                      " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0],
-                      dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]);
-        QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2],
-                      src0->ne[3]);
-    }
-
-    QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions;
-    QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions;
-    QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions;
-    perf.info();
-}
-
 static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx,
                             const ggml_tensor * src0, const ggml_tensor * src1,
                             ggml_tensor * dst) {
@@ -3038,21 +2819,14 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx,
                               struct ggml_compute_params * params,
                               struct ggml_tensor * tensor) {
     ggml_qnn_func_t func = nullptr;
-    ggml_qnn_func_common_t func_common = nullptr;

     switch (tensor->op) {
         case GGML_OP_ADD:
             func = ggml_qnn_add;
             break;

-        case GGML_OP_MUL:
-            func_common = ggml_qnn_hanlde_op;
-            break;
-
         case GGML_OP_MUL_MAT:
             func = ggml_qnn_mul_mat;
             break;

         case GGML_OP_REPEAT:
             func = ggml_qnn_repeat;
             break;
@@ -3062,15 +2836,12 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx,
         case GGML_OP_DUP:
            func = ggml_qnn_dup;
            break;

         case GGML_OP_ACC:
            func = ggml_qnn_acc;
            break;

         case GGML_OP_DIV:
            func = ggml_qnn_div;
            break;

         case GGML_OP_UNARY:
            switch (ggml_get_unary_op(tensor)) {
                case GGML_UNARY_OP_GELU:
@@ -3169,10 +2940,9 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx,
             return false;
     }

-    if (nullptr != func) func(ctx, tensor->src[0], tensor->src[1], tensor);
-
-    if (nullptr != func_common)
-        func_common(ctx, tensor->op, tensor->src[0], tensor->src[1], tensor);
+    if (nullptr != func) {
+        func(ctx, tensor->src[0], tensor->src[1], tensor);
+    }

     return true;
 }
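Review note: with func_common and ggml_qnn_hanlde_op gone, dispatch reduces to one function-pointer lookup per op. The shape of the surviving code, as a minimal standalone illustration (types simplified):

```cpp
#include <cstdio>

typedef void (*op_func_t)(int a, int b);

static void op_add(int a, int b)     { std::printf("%d\n", a + b); }
static void op_mul_mat(int a, int b) { std::printf("%d\n", a * b); }

// Same dispatch shape as ggml_qnn_compute_forward after the patch:
// select one function pointer in a switch, run it if set.
static bool dispatch(int op, int a, int b) {
    op_func_t func = nullptr;
    switch (op) {
        case 0: func = op_add;     break;
        case 1: func = op_mul_mat; break;
        default: return false;
    }
    if (nullptr != func) {
        func(a, b);
    }
    return true;
}
```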
@@ -3221,41 +2991,28 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t
     }
     Qnn_Tensor_t qnn_tensor = QNN_TENSOR_INIT;

-    if (ctx->device != QNN_BACKEND_GPU) {
-        qnn_tensor = {
-            .version = QNN_TENSOR_VERSION_1,
-            {.v1 = {.id = 0,
-                    .name = tensor_name,
-                    .type = qnn_tensor_type,
-                    .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
-                    .dataType = qnn_data_type,
-                    .quantizeParams =
-                        {QNN_DEFINITION_UNDEFINED,
-                         QNN_QUANTIZATION_ENCODING_UNDEFINED,
-                         {.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
-                                                  .offset = 0}}},
-                    .rank = qnn_get_ggml_tensor_rank(tensor),
-                    .dimensions = dimensions,
-                    .memType = QNN_TENSORMEMTYPE_RAW,
-                    {.clientBuf = {.data = nullptr, .dataSize = 0}}}}};
-    } else {
-        qnn_tensor = {
-            .version = QNN_TENSOR_VERSION_1,
-            {.v1 = {.id = 0,
-                    .name = tensor_name,
-                    .type = qnn_tensor_type,
-                    .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
-                    .dataType = qnn_data_type,
-                    .quantizeParams =
-                        {QNN_DEFINITION_UNDEFINED,
-                         QNN_QUANTIZATION_ENCODING_UNDEFINED,
-                         {.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
-                                                  .offset = 0}}},
-                    .rank = qnn_get_ggml_tensor_rank(tensor),
-                    .dimensions = dimensions,
-                    .memType = QNN_TENSORMEMTYPE_MEMHANDLE,
-                    {.clientBuf = {.data = nullptr, .dataSize = 0}}}}};
+    Qnn_TensorMemType_t qnn_mem_type = QNN_TENSORMEMTYPE_RAW;
+    if (ctx->device == QNN_BACKEND_GPU) {
+        qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE;
     }

+    qnn_tensor = {
+        .version = QNN_TENSOR_VERSION_1,
+        {.v1 = {.id = 0,
+                .name = tensor_name,
+                .type = qnn_tensor_type,
+                .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
+                .dataType = qnn_data_type,
+                .quantizeParams =
+                    {QNN_DEFINITION_UNDEFINED,
+                     QNN_QUANTIZATION_ENCODING_UNDEFINED,
+                     {.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
+                                              .offset = 0}}},
+                .rank = qnn_get_ggml_tensor_rank(tensor),
+                .dimensions = dimensions,
+                .memType = qnn_mem_type,
+                {.clientBuf = {.data = nullptr, .dataSize = 0}}}}};
+
     Qnn_Tensor_t * p_qnn_tensor =
         (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t));
     if (nullptr == p_qnn_tensor) {
@@ -12,8 +12,8 @@ ANDROID_PLATFORM=android-34

 GGML_QNN_UT=ggml-qnn-ut
 REMOTE_PATH=/data/local/tmp/
-BUILDTYPE=Debug
 BUILDTYPE=Release
+BUILDTYPE=Debug


 function dump_vars()
@@ -100,7 +100,7 @@ function update_qnn_libs()
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/

-    #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully
+    #the QNN NPU(aka HTP) backend only verified on Qualcomm Snapdragon 8 Gen 3 equipped Android phone
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/
@@ -142,14 +142,9 @@ function run_ggml_qnn_ut()

     case "$ggmlop" in
         GGML_OP_ADD)
-            echo "adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend"
             adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend
         ;;

-        GGML_OP_MUL)
-            adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL -b $qnnbackend
-        ;;
-
         GGML_OP_MUL_MAT)
             adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL_MAT -b $qnnbackend
         ;;
@@ -169,7 +164,6 @@ function show_usage()
     echo "  $0 build            (build Android command line UT program)"
     echo "  $0 updateqnnlibs    (upload the latest QNN libs to Android phone)"
     echo "  $0 GGML_OP_ADD      0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
-    echo "  $0 GGML_OP_MUL      0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
     echo "  $0 GGML_OP_MUL_MAT  0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
     echo -e "\n\n\n"
 }
@@ -346,7 +346,7 @@ static void show_usage() {
        "\nUsage: test_qnn_ops [options]\n" \
        "\n" \
        "Options:\n" \
-       " -t GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MULMAT\n" \
+       " -t GGML_OP_ADD / GGML_OP_MULMAT\n" \
        " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" \
        " ?/h print usage infomation\n\n"
     );
@@ -418,13 +418,9 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) {
     QNN_LOG_DEBUG("sizex: %d\n", sizex);
     QNN_LOG_DEBUG("sizey: %d\n", sizey);

-    if (n_ggml_op_type == GGML_OP_MUL) {
-        src0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-        src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-    } else {
     src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
     src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-    }
     ggml_set_input(src0);
     ggml_set_input(src1);

@@ -432,9 +428,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) {
     case GGML_OP_ADD:
         dst = ggml_add(ctx, src0, src1);
         break;
-    case GGML_OP_MUL:
-        dst = ggml_mul(ctx, src0, src1);
-        break;
     case GGML_OP_MUL_MAT:
         dst = ggml_mul_mat(ctx, src0, src1);
         break;
@@ -518,8 +511,6 @@ int main(int argc, char * argv[]) {
         n_ggml_op_type = GGML_OP_ADD;
     } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) {
         n_ggml_op_type = GGML_OP_MUL_MAT;
-    } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) {
-        n_ggml_op_type = GGML_OP_MUL;
     } else {
         show_usage();
         return 1;