diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp
index f59c54fca..f268c7f0e 100644
--- a/ggml-qnn.cpp
+++ b/ggml-qnn.cpp
@@ -55,7 +55,7 @@
 #include "Saver/QnnSaver.h"
 #include "System/QnnSystemInterface.h"
 #include "HTP/QnnHtpDevice.h"
-#include
+#include "HTP/QnnHtpGraph.h"
 
 // =================================================================================================
 //
@@ -91,12 +91,6 @@ typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx,
                                 const ggml_tensor * src1,
                                 ggml_tensor * dst);
 
-typedef void (*ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx,
-                                       const ggml_op ggml_op,
-                                       const ggml_tensor * src0,
-                                       const ggml_tensor * src1,
-                                       ggml_tensor * dst);
-
 enum qcom_htp_arch {
     NONE = 0,
     V68  = 68,
@@ -424,6 +418,7 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor
     return true;
 }
 
+#ifndef NDEBUG
 #define CHECK_PARAMS(ctx, src0, src1, dst)                          \
     do {                                                            \
         if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) {  \
@@ -431,6 +426,10 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor
         }                                                           \
     } while (0)
 
+#else
+#define CHECK_PARAMS(ctx, src0, src1, dst)
+#endif
+
 #if ENABLE_QNNBACKEND_PERF
 class qnn_perf {
 public:
@@ -446,7 +445,7 @@ public:
     void info() {
         _end_time = ggml_time_us();
         _duration = (_end_time - _begin_time);
-        QNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration);
+        QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration);
     }
 
 private:
@@ -809,7 +808,7 @@ static void qnn_sdk_logcallback(const char * fmt, QnnLog_Level_t level,
         memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN);
         vsnprintf(reinterpret_cast<char *>(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp);
-        QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf);
+        QNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf);
     }
 #endif
 }
@@ -1069,7 +1068,7 @@ class qnn_instance {
             arch_devconfig.option       = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
             arch_devconfig.customConfig = &arch_customconfig;
 
-            const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, NULL};
+            const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr};
             qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle);
         } else {
             qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle);
@@ -1137,10 +1136,14 @@ class qnn_instance {
             _pfn_rpc_mem_init();
         }
 
-        std::vector<const QnnContext_Config_t *> temp_context_config;
+        /* TODO: not used, keep it for further usage
+        QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT;
+        qnn_context_config.priority = QNN_PRIORITY_DEFAULT;
+        const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr};
+        */
         _qnn_interface.qnn_context_create(
             _qnn_backend_handle, _qnn_device_handle,
-            temp_context_config.empty() ? nullptr : temp_context_config.data(),
+            nullptr,
             &_qnn_context_handle);
         if (nullptr == _qnn_context_handle) {
             QNN_LOG_WARN("why failed to initialize qnn context\n");
@@ -1157,9 +1160,11 @@ class qnn_instance {
         size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048};
         size_t probe_counts  = sizeof(probe_slots) / sizeof(size_t);
         for (size_t idx = 0; idx < probe_counts; idx++) {
-            rpc_buffer = static_cast<uint8_t *>(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4));
+            rpc_buffer = static_cast<uint8_t *>(alloc_rpcmem(
+                    probe_slots[idx] * size_in_mb, 4));
             if (nullptr == rpc_buffer) {
-                QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno));
+                QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n",
+                             probe_slots[idx], strerror(errno));
                 break;
            } else {
                 candidate_size = probe_slots[idx];
@@ -1262,8 +1267,8 @@ class qnn_instance {
         return ret_status;
     }
 
-    //keep it for further usage of offload the entire cgraph to a single QNN DAG directly
-    //which was used in Qualcomm's dedicated AI technology
+    //TODO: keep it for further usage of offloading the entire cgraph to a single QNN DAG directly,
+    //      which was used in Qualcomm's dedicated AI technology
 #if 0
     int init_qnn_graph(const char * graph_name, bool debug,
                        uint8_t do_node_validation = true,
@@ -1430,13 +1435,14 @@ class qnn_instance {
         QnnHtpPerfInfrastructure_PowerConfig_t power_config;
         memset(&power_config, 0, sizeof(power_config));
         power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3;
-        power_config.dcvsV3Config.dcvsEnable = 0;
+        power_config.dcvsV3Config.setDcvsEnable = 1;
+        power_config.dcvsV3Config.dcvsEnable    = 0;
         power_config.dcvsV3Config.contextId = _qnn_power_configid;
         power_config.dcvsV3Config.powerMode =
             QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE;
         power_config.dcvsV3Config.setSleepLatency =
             1; // true to consider Latency parameter otherwise false
-        power_config.dcvsV3Config.sleepLatency = 10;
+        power_config.dcvsV3Config.sleepLatency = 40;
         power_config.dcvsV3Config.setBusParams =
             1; // true to consider Bus parameter otherwise false
         power_config.dcvsV3Config.setCoreParams =
@@ -1459,6 +1465,7 @@ class qnn_instance {
             DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
         power_config.dcvsV3Config.coreVoltageCornerMax =
             DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
+
         // set power config with different performance parameters
         const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {
             &power_config, nullptr};
@@ -1550,6 +1557,7 @@ class qnn_instance {
             QNN_LOG_WARN("rpc memory already allocated\n");
             return 3;
         }
+
         if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) {
             QNN_LOG_WARN("tensor %s has been registered shared memory\n",
                          (QNN_VER_PTR(*p_tensor)->name));
@@ -1710,7 +1718,7 @@ class qnn_instance {
         int result = 0;
 
         if (nullptr == _system_lib_handle) {
-            QNN_LOG_DEBUG("system lib handle is null\n");
+            QNN_LOG_WARN("system lib handle is null\n");
             return 1;
         }
@@ -1724,8 +1732,7 @@ class qnn_instance {
 
         int dlclose_error = dlclose(_system_lib_handle);
         if (dlclose_error != 0) {
-            QNN_LOG_WARN("failed to close QnnSystem library, error %s\n",
-                         dlerror());
+            QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror());
             return 2;
         }
@@ -1740,8 +1747,7 @@ class qnn_instance {
 
         void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL);
         if (nullptr == lib_handle) {
-            QNN_LOG_WARN("can not open QNN library %s, with error: %s",
-                         lib_path.c_str(), dlerror());
+            QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror());
             return 1;
         }
@@ -1749,8 +1755,7 @@ class qnn_instance {
             load_qnn_functionpointers(lib_handle, "QnnInterface_getProviders");
         if (nullptr == get_providers) {
-            QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s",
-                         dlerror());
+            QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror());
             return 2;
         }
@@ -1758,14 +1763,12 @@ class qnn_instance {
         const QnnInterface_t ** provider_list = nullptr;
         error = get_providers(&provider_list, &num_providers);
         if (error != QNN_SUCCESS) {
-            QNN_LOG_WARN("failed to get providers, error %d",
-                         QNN_GET_ERROR_CODE(error));
+            QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error));
             return 3;
         }
         QNN_LOG_DEBUG("num_providers=%d\n", num_providers);
         if (num_providers != _required_num_providers) {
-            QNN_LOG_WARN("providers is %d instead of required %d", num_providers,
-                         _required_num_providers);
+            QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers);
             return 4;
         }
@@ -1797,16 +1800,14 @@ class qnn_instance {
         BackendIdType backend_id          = provider_list[0]->backendId;
         _lib_path_to_backend_id[lib_path] = backend_id;
         if (_loaded_backend.count(backend_id) > 0) {
-            QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n",
-                         lib_path.c_str(), backend_id);
+            QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id);
         }
         _loaded_backend[backend_id] = provider_list[0];
         if (_loaded_lib_handle.count(backend_id) > 0) {
             QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]);
             int dlclose_error = dlclose(_loaded_lib_handle[backend_id]);
             if (dlclose_error != 0) {
-                QNN_LOG_WARN("fail to close %p with error %s\n",
-                             _loaded_lib_handle[backend_id], dlerror());
+                QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror());
             }
         }
         _loaded_lib_handle[backend_id] = lib_handle;
@@ -1820,8 +1821,7 @@ class qnn_instance {
         for (auto & it : _loaded_lib_handle) {
             dlclose_error = dlclose(it.second);
             if (dlclose_error != 0) {
-                QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first,
-                             dlerror());
+                QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror());
             }
         }
@@ -1924,7 +1924,6 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
     const int64_t ne01 = src0->ne[1];
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
-    // make qnn_get_ggml_tensor_rank and QNN SDK happy
     if (ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1) {
         return false;
@@ -1932,13 +1931,13 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
 
     // TODO: support other GGML OPs using QNN API
     //       a GENERAL approach could fix this problem in a standalone PR of refine ggml backend
-    //       subsystem for mixed inference between CPU&GPU / CPU&NPU easily for ANY ggml backends
-    //       which the backend's ggml_backend_xxx_buffer_is_host return true.
-    //       this approach could be found:
+    //       subsystem for hybrid inference between CPU&GPU / CPU&NPU easily (less than 100 LoC and no
+    //       side-effect to the existing code) for ANY ggml backend whose
+    //       ggml_backend_xxx_buffer_is_host returns true. this approach could be found at:
     //       https://github.com/ggerganov/llama.cpp/pull/7641
     bool supported_op = false;
     supported_op = (tensor->op == GGML_OP_ADD);
-    supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT));
+    supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT));
     if (!supported_op) {
         return false;
     }
@@ -1950,14 +1949,9 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
         }
     }
 
-    int qtype = src0->type;
-    if (tensor->op == GGML_OP_MUL) {
-        return (qtype == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32);
-    }
-
     if (tensor->op == GGML_OP_MUL_MAT) {
         if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) {
-            //make mul_mat with QNN RPC happy
+            //commented out to make the UT of mul_mat with QNN RPC happy
             //return false;
         }
     }
@@ -1965,6 +1959,8 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
     return true;
 }
 
+//TODO: this function can be removed later because there is duplicated code with ggml_qnn_mul_mat;
+//      keep it to illustrate how to implement a specific GGML OP using the QNN API + QNN RPC
 static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
                          const ggml_tensor * src1, ggml_tensor * dst) {
     Qnn_ErrorHandle_t error = QNN_SUCCESS;
@@ -1986,10 +1982,11 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
     tensor_1 = (Qnn_Tensor_t *) src1->extra;
     tensor_2 = (Qnn_Tensor_t *) dst->extra;
     instance = ctx->instance;
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
+
     qnn_perf perf("ggml_qnn_add");
     perf.start();
 
-    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
     QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
@@ -2034,17 +2031,31 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
 
         QnnHtpGraph_CustomConfig_t dlbc_config;
         dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
-        /* dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
-           dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC
-        */
-        dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
-
+        dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
+        dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
         QnnGraph_Config_t graph_dlbc_config;
         graph_dlbc_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
         graph_dlbc_config.customConfig = &dlbc_config;
 
-        const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL};
+        QnnHtpGraph_CustomConfig_t opt_config;
+        opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+        opt_config.optimizationOption.floatValue = 1; // 1 / 3
+        QnnGraph_Config_t graph_opt_config;
+        graph_opt_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+        graph_opt_config.customConfig = &opt_config;
+
+        QnnHtpGraph_CustomConfig_t vtcm_config;
+        vtcm_config.option       = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
+        vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb;
+        QnnGraph_Config_t graph_vtcm_config;
+        graph_vtcm_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+        graph_vtcm_config.customConfig = &vtcm_config;
+
+        const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config,
+                                                     &graph_dlbc_config,
+                                                     &graph_vtcm_config,
+                                                     &graph_opt_config,
+                                                     NULL};
         error = qnn_raw_interface.graphCreate(
             instance->get_qnn_context_handle(), graph_name.c_str(),
             p_graphconfig, &graph_handle);
@@ -2113,27 +2124,33 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
         uint8_t * qnn_buffer_2 = nullptr;
         qnn_instance * instance = ctx->instance;
 
-        qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
+        qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                                                  ggml_nbytes(src0), 4));
         if (nullptr == qnn_buffer_0) {
             QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+            goto failure;
         } else {
             QNN_LOG_INFO("alloc rpcmem successfully\n");
         }
         instance->register_rpcmem(qnn_buffer_0, tensor_0);
         memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
 
-        qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src1), 4));
+        qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                                                  ggml_nbytes(src1), 4));
         if (nullptr == qnn_buffer_1) {
             QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+            goto failure;
         } else {
             QNN_LOG_INFO("alloc rpcmem successfully\n");
         }
         instance->register_rpcmem(qnn_buffer_1, tensor_1);
         memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
 
-        qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(dst), 4));
+        qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                                                  ggml_nbytes(dst), 4));
         if (nullptr == qnn_buffer_2) {
             QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+            goto failure;
         } else {
             QNN_LOG_INFO("alloc rpcmem successfully\n");
         }
@@ -2144,23 +2161,33 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
         Qnn_OpConfig_t op_config = {
             (Qnn_OpConfigVersion_t) 1,
-            .v1 = {"ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW,
-                   QNN_OP_ELEMENT_WISE_ADD, 0, qnn_params,
-                   2, tensor_inputs, 1,
-                   tensor_outputs}};
+            .v1 = {"ggml_op_add",
+                   QNN_OP_PACKAGE_NAME_QTI_AISW,
+                   QNN_OP_ELEMENT_WISE_ADD,
+                   0, qnn_params,
+                   2, tensor_inputs,
+                   1, tensor_outputs}
+        };
         error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
-        error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
+        error = qnn_raw_interface.graphFinalize(graph_handle,
+                                                nullptr, nullptr);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
-        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2,
+        error = qnn_raw_interface.graphExecute(graph_handle,
+                                               tensor_inputs, 2,
                                                tensor_outputs, 1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
@@ -2221,9 +2248,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
 
         Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2,
+        error = qnn_raw_interface.graphExecute(graph_handle,
+                                               tensor_inputs, 2,
                                                tensor_outputs, 1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
@@ -2299,6 +2332,8 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
     tensor_1 = (Qnn_Tensor_t *) src1->extra;
     tensor_2 = (Qnn_Tensor_t *) dst->extra;
     instance = ctx->instance;
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
+
     qnn_perf perf("ggml_qnn_mul_mat");
     perf.start();
 
@@ -2307,7 +2342,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
     tensor_2 = (Qnn_Tensor_t *) dst->extra;
     instance = ctx->instance;
 
-    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
     QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
@@ -2338,6 +2372,11 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
     uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions;
     uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions;
 
+    //TODO: for the scenario of quantized data in src0
+    //      pass-1: dequantize src0 to FP32
+    //      pass-2: dq-src0 * src1
+    //      the performance gain is worth it although there is a performance loss in pass-1
+
     if (!graph_initialized) {
         graph_name = graph_name + "_" + std::to_string(ctx->threads) +
                      "_" + src0->name + "_" + src1->name;
@@ -2352,17 +2391,31 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
 
         QnnHtpGraph_CustomConfig_t dlbc_config;
         dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
-        /* dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
-           dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC
-        */
-        dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
-
+        dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
+        dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
         QnnGraph_Config_t graph_dlbc_config;
         graph_dlbc_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
         graph_dlbc_config.customConfig = &dlbc_config;
 
-        const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL};
+        QnnHtpGraph_CustomConfig_t opt_config;
+        opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+        opt_config.optimizationOption.floatValue = 1; // 1 / 3
+        QnnGraph_Config_t graph_opt_config;
+        graph_opt_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+        graph_opt_config.customConfig = &opt_config;
+
+        QnnHtpGraph_CustomConfig_t vtcm_config;
+        vtcm_config.option       = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
+        vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb;
+        QnnGraph_Config_t graph_vtcm_config;
+        graph_vtcm_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+        graph_vtcm_config.customConfig = &vtcm_config;
+
+        const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config,
+                                                     &graph_dlbc_config,
+                                                     &graph_vtcm_config,
+                                                     &graph_opt_config,
+                                                     NULL};
         error = qnn_raw_interface.graphCreate(
             instance->get_qnn_context_handle(), graph_name.c_str(),
             p_graphconfig, &graph_handle);
@@ -2428,27 +2481,33 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
         uint8_t * qnn_buffer_2 = nullptr;
         qnn_instance * instance = ctx->instance;
 
-        qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
+        qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                                                  ggml_nbytes(src0), 4));
         if (nullptr == qnn_buffer_0) {
             QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+            goto failure;
         } else {
             QNN_LOG_INFO("alloc rpcmem successfully\n");
        }
         instance->register_rpcmem(qnn_buffer_0, tensor_0);
         memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
 
-        qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src1), 4));
+        qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                                                  ggml_nbytes(src1), 4));
         if (nullptr == qnn_buffer_1) {
             QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+            goto failure;
         } else {
             QNN_LOG_INFO("alloc rpcmem successfully\n");
         }
         instance->register_rpcmem(qnn_buffer_1, tensor_1);
         memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
 
-        qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(dst), 4));
+        qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                                                  ggml_nbytes(dst), 4));
         if (nullptr == qnn_buffer_2) {
             QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+            goto failure;
         } else {
             QNN_LOG_INFO("alloc rpcmem successfully\n");
         }
@@ -2457,25 +2516,35 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
         Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1,
-                                    .v1 = {"ggml_op_mul_mat",
-                                           QNN_OP_PACKAGE_NAME_QTI_AISW,
-                                           QNN_OP_MAT_MUL, 0, qnn_params, 2,
-                                           tensor_inputs, 1, tensor_outputs}};
+        Qnn_OpConfig_t op_config = {
+            (Qnn_OpConfigVersion_t) 1,
+            .v1 = {"ggml_op_mul_mat",
+                   QNN_OP_PACKAGE_NAME_QTI_AISW,
+                   QNN_OP_MAT_MUL,
+                   0, qnn_params,
+                   2, tensor_inputs,
+                   1, tensor_outputs}
+        };
         error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
-        error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
+        error = qnn_raw_interface.graphFinalize(graph_handle,
+                                                nullptr, nullptr);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
         error = qnn_raw_interface.graphExecute(graph_handle,
-                                                tensor_inputs, 2,
+                                               tensor_inputs, 2,
                                                tensor_outputs, 1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
@@ -2537,9 +2606,14 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
         Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
         error = qnn_raw_interface.graphExecute(graph_handle,
-                                                tensor_inputs, 2,
+                                               tensor_inputs, 2,
                                                tensor_outputs, 1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
@@ -2580,299 +2654,6 @@ failure:
     perf.info();
 }
 
-// common function for GGML OPs using QNN API
-static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx,
-                               const enum ggml_op ggmlop,
-                               const ggml_tensor * src0, const ggml_tensor * src1,
-                               ggml_tensor * dst) {
-    Qnn_ErrorHandle_t error = QNN_SUCCESS;
-    bool graph_initialized = false;
-    qnn_instance * instance = nullptr;
-    std::string qnn_graph_name = "ggml_qnn_graph";
-    std::string qnn_op_config_name = "ggml_qnn_op_config";
-    const char * qnn_op_name = nullptr;
-    Qnn_GraphHandle_t graph_handle = nullptr;
-    Qnn_Tensor_t * tensor_0 = nullptr;
-    Qnn_Tensor_t * tensor_1 = nullptr;
-    Qnn_Tensor_t * tensor_2 = nullptr;
-    Qnn_Param_t qnn_params[] = {};
-    Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
-    Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
-    Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32;
-
-    CHECK_PARAMS(ctx, src0, src1, dst);
-    tensor_0 = (Qnn_Tensor_t *) src0->extra;
-    tensor_1 = (Qnn_Tensor_t *) src1->extra;
-    tensor_2 = (Qnn_Tensor_t *) dst->extra;
-    instance = ctx->instance;
-    qnn_perf perf(ggml_op_name(ggmlop));
-    perf.start();
-
-    qnn_op_name = qnn_opname_from_ggmlop(ggmlop);
-    if (nullptr == qnn_op_name) {
-        QNN_LOG_WARN("ggml op %d(%s) not supported currently", ggmlop, ggml_op_name(ggmlop));
-        return;
-    }
-
-    tensor_0 = (Qnn_Tensor_t *) src0->extra;
-    tensor_1 = (Qnn_Tensor_t *) src1->extra;
-    tensor_2 = (Qnn_Tensor_t *) dst->extra;
-    instance = ctx->instance;
-    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
-
-    src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type);
-    src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type);
-    dst_qnn_type  = qnn_datatype_from_ggml_datatype(dst->type);
-
-    QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
-    QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
-    QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
-
-    uint32_t dimensions_input_0[] = {
-        (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2],
-        (uint32_t) src0->ne[3]};
-    uint32_t dimensions_input_1[] = {
-        (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2],
-        (uint32_t) src1->ne[3]};
-    uint32_t dimensions_output[] = {
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
-        (uint32_t) dst->ne[3]};
-
-    std::string map_entry = std::string(ggml_op_name(ggmlop));
-    if (instance->_qnn_graph_map.find(map_entry) !=
-        instance->_qnn_graph_map.end()) {
-        graph_initialized = true;
-        auto & graph_item = instance->_qnn_graph_map[map_entry];
-        graph_handle = std::get<0>(graph_item);
-    }
-
-    uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions;
-    uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions;
-    uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions;
-
-    if (!graph_initialized) {
-        qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) +
-                         std::to_string(ctx->threads) + src0->name + "_" +
-                         src1->name;
-        qnn_op_config_name = qnn_op_config_name + "_" + ggml_op_name(ggmlop) +
-                             std::to_string(ctx->threads) + src0->name + "_" +
-                             src1->name;
-        QNN_LOG_DEBUG("qnn graph name %s", qnn_graph_name.c_str());
-        QNN_LOG_DEBUG("qnn op_config name %s", qnn_op_config_name.c_str());
-        error = qnn_raw_interface.graphCreate(
-            instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr,
-            &graph_handle);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph "
-                         "name %s, error = %d\n",
-                         ggml_op_name(ggmlop), qnn_graph_name.c_str(), error);
-            goto failure;
-        }
-
-        if (ctx->device == QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
-            QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0};
-
-            QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
-            QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0};
-
-            QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
-            QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0};
-        }
-
-        error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-
-        QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
-        QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0);
-        QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type;
-        QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
-        QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1);
-        QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
-        QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
-        QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst);
-        QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
-
-        if (ctx->device != QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
-                                                 qnn_get_ggml_tensor_data_size(src0)};
-            QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
-                                                 qnn_get_ggml_tensor_data_size(src1)};
-            QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
-                                                 qnn_get_ggml_tensor_data_size(dst)};
-        } else {
-            uint8_t * qnn_buffer_0 = nullptr;
-            uint8_t * qnn_buffer_1 = nullptr;
-            uint8_t * qnn_buffer_2 = nullptr;
-            qnn_instance * instance = ctx->instance;
-
-            qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
-            if (nullptr == qnn_buffer_0) {
-                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
-            } else {
-                QNN_LOG_INFO("alloc rpcmem successfully\n");
-            }
-            instance->register_rpcmem(qnn_buffer_0, tensor_0);
-            memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
-
-            qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src1), 4));
-            if (nullptr == qnn_buffer_1) {
-                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
-            } else {
-                QNN_LOG_INFO("alloc rpcmem successfully\n");
-            }
-            instance->register_rpcmem(qnn_buffer_1, tensor_1);
-            memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
-
-            qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(dst), 4));
-            if (nullptr == qnn_buffer_2) {
-                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
-            } else {
-                QNN_LOG_INFO("alloc rpcmem successfully\n");
-            }
-            instance->register_rpcmem(qnn_buffer_2, tensor_2);
-        }
-
-        Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
-        Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1,
-                                    .v1 = {qnn_op_config_name.c_str(),
-                                           QNN_OP_PACKAGE_NAME_QTI_AISW,
-                                           qnn_op_name, 0, qnn_params, 2,
-                                           tensor_inputs, 1, tensor_outputs}};
-        error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.graphExecute(graph_handle,
-                                               tensor_inputs, 2,
-                                               tensor_outputs, 1,
-                                               nullptr, nullptr);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-
-        if (ctx->device == QNN_BACKEND_NPU) {
-            uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_2)->memHandle));
-            memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
-        }
-
-        auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2);
-        instance->_qnn_graph_map[map_entry] = graph_item;
-    } else {
-        auto & graph_item = instance->_qnn_graph_map[map_entry];
-        graph_handle = std::get<0>(graph_item);
-        tensor_0     = std::get<1>(graph_item);
-        tensor_1     = std::get<2>(graph_item);
-        tensor_2     = std::get<3>(graph_item);
-
-        uint32_t dimensions_input_0[] = {
-            (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-            (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]};
-        uint32_t dimensions_input_1[] = {
-            (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-            (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]};
-        uint32_t dimensions_output[] = {
-            (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
-            (uint32_t) dst->ne[3]};
-        QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
-        QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0);
-        QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type;
-        QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
-        QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1);
-        QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
-        QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
-        QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst);
-        QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
-
-        if (ctx->device != QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
-                                                 qnn_get_ggml_tensor_data_size(src0)};
-            QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
-                                                 qnn_get_ggml_tensor_data_size(src1)};
-            QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
-                                                 qnn_get_ggml_tensor_data_size(dst)};
-        } else {
-            uint8_t * qnn_buffer_0 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_0)->memHandle));
-            if (nullptr != qnn_buffer_0)
-                memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
-
-            uint8_t * qnn_buffer_1 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_1)->memHandle));
-            if (nullptr != qnn_buffer_1)
-                memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
-        }
-
-        Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
-        Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        error = qnn_raw_interface.graphExecute(graph_handle,
-                                               tensor_inputs, 2,
-                                               tensor_outputs, 1,
-                                               nullptr, nullptr);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-
-        if (ctx->device == QNN_BACKEND_NPU) {
-            uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_2)->memHandle));
-            if (nullptr != qnn_buffer_2)
-                memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
-        }
-    }
-
-failure:
-    if (QNN_SUCCESS != error) {
-        QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0));
-        QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1));
-        QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2));
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                      " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      src0->name, src0->type, ggml_type_name(src0->type),
-                      src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0],
-                      src0->nb[1], src0->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                      " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      src1->name, src1->type, ggml_type_name(src1->type),
-                      src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0],
-                      src1->nb[1], src1->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                      " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0],
-                      dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]);
-        QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2],
-                      src0->ne[3]);
-    }
-
-    QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions;
-    QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions;
-    QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions;
-    perf.info();
-}
-
 static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx,
                             const ggml_tensor * src0, const ggml_tensor * src1,
                             ggml_tensor * dst) {
@@ -3038,21 +2819,14 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx,
                               struct ggml_compute_params * params,
                               struct ggml_tensor * tensor) {
     ggml_qnn_func_t func = nullptr;
-    ggml_qnn_func_common_t func_common = nullptr;
 
     switch (tensor->op) {
     case GGML_OP_ADD:
         func = ggml_qnn_add;
         break;
-
-    case GGML_OP_MUL:
-        func_common = ggml_qnn_hanlde_op;
-        break;
-
     case GGML_OP_MUL_MAT:
         func = ggml_qnn_mul_mat;
         break;
-
     case GGML_OP_REPEAT:
         func = ggml_qnn_repeat;
         break;
@@ -3062,15 +2836,12 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx,
     case GGML_OP_DUP:
         func = ggml_qnn_dup;
         break;
-
     case GGML_OP_ACC:
         func = ggml_qnn_acc;
         break;
-
     case GGML_OP_DIV:
         func = ggml_qnn_div;
         break;
-
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(tensor)) {
         case GGML_UNARY_OP_GELU:
@@ -3169,10 +2940,9 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx,
         return false;
     }
 
-    if (nullptr != func) func(ctx, tensor->src[0], tensor->src[1], tensor);
-
-    if (nullptr != func_common)
-        func_common(ctx, tensor->op, tensor->src[0], tensor->src[1], tensor);
+    if (nullptr != func) {
+        func(ctx, tensor->src[0], tensor->src[1], tensor);
+    }
 
     return true;
 }
@@ -3221,41 +2991,28 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t
     }
 
     Qnn_Tensor_t qnn_tensor = QNN_TENSOR_INIT;
-    if (ctx->device != QNN_BACKEND_GPU) {
-        qnn_tensor = {
-            .version = QNN_TENSOR_VERSION_1,
-            {.v1 = {.id = 0,
-                    .name = tensor_name,
-                    .type = qnn_tensor_type,
-                    .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
-                    .dataType = qnn_data_type,
-                    .quantizeParams =
-                        {QNN_DEFINITION_UNDEFINED,
-                         QNN_QUANTIZATION_ENCODING_UNDEFINED,
-                         {.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
-                                                  .offset = 0}}},
-                    .rank = qnn_get_ggml_tensor_rank(tensor),
-                    .dimensions = dimensions,
-                    .memType = QNN_TENSORMEMTYPE_RAW,
-                    {.clientBuf = {.data = nullptr, .dataSize = 0}}}}};
-    } else {
-        qnn_tensor = {
-            .version = QNN_TENSOR_VERSION_1,
-            {.v1 = {.id = 0,
-                    .name = tensor_name,
-                    .type = qnn_tensor_type,
-                    .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
-                    .dataType = qnn_data_type,
-                    .quantizeParams =
-                        {QNN_DEFINITION_UNDEFINED,
-                         QNN_QUANTIZATION_ENCODING_UNDEFINED,
-                         {.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
-                                                  .offset = 0}}},
-                    .rank = qnn_get_ggml_tensor_rank(tensor),
-                    .dimensions = dimensions,
-                    .memType = QNN_TENSORMEMTYPE_MEMHANDLE,
-                    {.clientBuf = {.data = nullptr, .dataSize = 0}}}}};
+    Qnn_TensorMemType_t qnn_mem_type = QNN_TENSORMEMTYPE_RAW;
+    if (ctx->device == QNN_BACKEND_GPU) {
+        qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE;
     }
+
+    qnn_tensor = {
+        .version = QNN_TENSOR_VERSION_1,
+        {.v1 = {.id = 0,
+                .name = tensor_name,
+                .type = qnn_tensor_type,
+                .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
+                .dataType = qnn_data_type,
+                .quantizeParams =
+                    {QNN_DEFINITION_UNDEFINED,
+                     QNN_QUANTIZATION_ENCODING_UNDEFINED,
+                     {.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
+                                              .offset = 0}}},
+                .rank = qnn_get_ggml_tensor_rank(tensor),
+                .dimensions = dimensions,
+                .memType = qnn_mem_type,
+                {.clientBuf = {.data = nullptr, .dataSize = 0}}}}};
+
     Qnn_Tensor_t * p_qnn_tensor =
         (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t));
     if (nullptr == p_qnn_tensor) {
diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh
index 4c21be5a4..e12b987b8 100755
--- a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh
+++ b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh
@@ -12,8 +12,8 @@ ANDROID_PLATFORM=android-34
 GGML_QNN_UT=ggml-qnn-ut
 REMOTE_PATH=/data/local/tmp/
 
-BUILDTYPE=Debug
 BUILDTYPE=Release
+BUILDTYPE=Debug
 
 
 function dump_vars()
@@ -100,7 +100,7 @@ function update_qnn_libs()
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/
 
-    #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully
+    #the QNN NPU (aka HTP) backend has only been verified on an Android phone equipped with the Qualcomm Snapdragon 8 Gen 3
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/
@@ -142,14 +142,9 @@ function run_ggml_qnn_ut()
 
     case "$ggmlop" in
         GGML_OP_ADD)
-            echo "adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend"
             adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend
         ;;
 
-        GGML_OP_MUL)
-            adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL -b $qnnbackend
-        ;;
-
         GGML_OP_MUL_MAT)
             adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL_MAT -b $qnnbackend
         ;;
@@ -169,7 +164,6 @@ function show_usage()
     echo "  $0 build          (build Android command line UT program)"
     echo "  $0 updateqnnlibs  (upload the latest QNN libs to Android phone)"
     echo "  $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
-    echo "  $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
    echo "  $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
     echo -e "\n\n\n"
 }
diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp
index 0abfc6207..fa0883af8 100644
--- a/tests/ggml-qnn/ggml-qnn-ut.cpp
+++ b/tests/ggml-qnn/ggml-qnn-ut.cpp
@@ -346,7 +346,7 @@ static void show_usage() {
         "\nUsage: test_qnn_ops [options]\n" \
         "\n" \
         "Options:\n" \
-        " -t GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MULMAT\n" \
+        " -t GGML_OP_ADD / GGML_OP_MUL_MAT\n" \
         " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" \
         " ?/h print usage infomation\n\n"
     );
@@ -418,13 +418,9 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) {
     QNN_LOG_DEBUG("sizex: %d\n", sizex);
     QNN_LOG_DEBUG("sizey: %d\n", sizey);
 
-    if (n_ggml_op_type == GGML_OP_MUL) {
-        src0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-        src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-    } else {
-        src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-        src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-    }
+    src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
+
     ggml_set_input(src0);
     ggml_set_input(src1);
 
@@ -432,9 +428,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) {
     case GGML_OP_ADD:
         dst = ggml_add(ctx, src0, src1);
         break;
-    case GGML_OP_MUL:
-        dst = ggml_mul(ctx, src0, src1);
-        break;
     case GGML_OP_MUL_MAT:
         dst = ggml_mul_mat(ctx, src0, src1);
         break;
@@ -518,8 +511,6 @@ int main(int argc, char * argv[]) {
                 n_ggml_op_type = GGML_OP_ADD;
             } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) {
                 n_ggml_op_type = GGML_OP_MUL_MAT;
-            } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) {
-                n_ggml_op_type = GGML_OP_MUL;
             } else {
                 show_usage();
                 return 1;