From dd29834c115f5c644b34fb7e60c0175b9890da29 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Thu, 6 Jun 2024 17:12:28 +0800 Subject: [PATCH] add supportive of quantize data type Q8_0 --- ggml-qnn.cpp | 176 ++++++----- ggml-qnn.h | 5 +- tests/ggml-qnn/ggml-qnn-ut-build-run.sh | 37 ++- tests/ggml-qnn/ggml-qnn-ut.cpp | 390 +++++++++++++++--------- 4 files changed, 379 insertions(+), 229 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 15c6538d1..d0927f22e 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -72,8 +72,6 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_HEAP_ID_SYSTEM 25 -#define GGML_DUMP_TENSOR(tensor) ggml_tensor_dump(tensor, #tensor) - #define GGML_QNN_LOGBUF_LEN 4096 #define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend @@ -195,8 +193,17 @@ static ggml_backend_t g_qnn_backend = nullptr; static int g_current_device = QNN_BACKEND_GGML; - -//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently +//according to the QNN SDK Reference Guide, +//CPU - Choose a non-quantized model. Quantized models are currently incompatible with the CPU backend +//GPU - Choose a non-quantized model. Quantized models are currently incompatible with the GPU backend +//HTP - Choose a quantized model. Quantized models are required when running on the HTP backend +//DSP - Choose a quantized model. Quantized models are required when running on the DSP backend +//HTA - Choose a quantized model. Quantized models are required when running on the HTA backend +// +//only focus on Qualcomm CPU/GPU/NPU backend in this implementation of QNN backend for ggml currently +//Qualcomm CPU: Qualcomm Kryo CPU +//Qualcomm GPU: Qualcomm Adreno GPU +//Qualcomm NPU: aka HTP(Hexagon Tensor Processor), ~= cDSP(Compute DSP) + HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { [QNN_BACKEND_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, [QNN_BACKEND_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, @@ -849,6 +856,10 @@ static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { return QNN_DATATYPE_FLOAT_16; case GGML_TYPE_F32: return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; default: break; @@ -903,14 +914,8 @@ static const char * get_qnn_backend_name(int n_backend_type) { case 2: return "QNN-NPU"; case 3: - return "ggml"; //the default GGML backend, used to compare performance between QNN backend and the default GGML backend + return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML -#if 0 //QNN cDSP and HTA backend would not be used currently, focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently - case 3: - return "QNN-cDSP"; - case 4: - return "QNN-HTA"; -#endif default: return "unknown"; } @@ -1720,7 +1725,7 @@ static void ggml_qnn_logcallback(const char * fmt, double ms = (double) timestamp / 1000000.0; - { + if (0) { std::lock_guard lock(log_mutex); memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); @@ -1770,7 +1775,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) 
{ _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); #endif if (nullptr == _qnn_log_handle) { - QNN_LOG_WARN("why failed to initialize qnn log\n"); //DSP backend not work on Qualcomm SoC based low-end phone + QNN_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone return 4; } else { QNN_LOG_DEBUG("initialize qnn log successfully\n"); @@ -2010,14 +2015,14 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum const struct ggml_tensor * src0 = tensor->src[0]; const struct ggml_tensor * src1 = tensor->src[1]; - const int64_t ne00 = tensor->src[0]->ne[0]; - const int64_t ne01 = tensor->src[0]->ne[1]; + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; - const int64_t ne10 = tensor->src[1]->ne[0]; - const int64_t ne11 = tensor->src[1]->ne[1]; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; - const int64_t ne0 = tensor->ne[0]; - const int64_t ne1 = tensor->ne[1]; + const int64_t ne0 = tensor->ne[0]; + const int64_t ne1 = tensor->ne[1]; GGML_UNUSED(ne0); GGML_UNUSED(ne1); @@ -2057,30 +2062,15 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum return false; } - if (tensor->op == GGML_OP_ADD) { - //TODO: this is limitation - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) - && (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16); - + // GPU/NPU inference will slower then CPU inference when tensor->ne[1] < min batch size + if (tensor->ne[1] < 32) { + return false; } - if (tensor->op == GGML_OP_MUL_MAT) { - //TODO: this is limitation - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) - && (src0->type == src1->type) && (src0->type == tensor->type); + int qtype = src0->type; + return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || qtype == GGML_TYPE_Q8_0) + && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); - if (tensor->ne[1] < 32) { // GPU/NPU inference will slower then CPU inference when tensor->ne[1] < min batch size - return false; - } - - } - - //TODO: this is limitation - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) - && (src0->type == src1->type) && (src0->type == tensor->type); } @@ -2129,7 +2119,7 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; n_begin_time = ggml_time_us(); -#if 1 + QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, @@ -2147,17 +2137,23 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); -#endif QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + src0_qnn_type = 
qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - std::string map_entry = std::string(ggml_op_name(ggmlop)); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { graph_initialized = true; auto & graph_item = instance->_qnn_graph_map[map_entry]; @@ -2197,6 +2193,16 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + Qnn_Tensor_t tensor_inputs[] = { *tensor_0, *tensor_1 @@ -2245,6 +2251,11 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; @@ -2255,10 +2266,6 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - Qnn_Tensor_t tensor_inputs[] = { *tensor_0, *tensor_1 @@ -2337,7 +2344,6 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; n_begin_time = ggml_time_us(); -#if 1 QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, @@ -2355,17 +2361,23 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", 
QNN_TENSOR_GET_NAME(tensor_2)); -#endif QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - std::string map_entry = std::string(ggml_op_name(ggmlop)); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { graph_initialized = true; auto & graph_item = instance->_qnn_graph_map[map_entry]; @@ -2401,6 +2413,16 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + Qnn_Tensor_t tensor_inputs[] = { *tensor_0, *tensor_1 @@ -2543,7 +2565,7 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr } n_begin_time = ggml_time_us(); -#if 1 + QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, @@ -2561,11 +2583,17 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); -#endif QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { @@ -2606,6 +2634,16 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr 
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + Qnn_Tensor_t tensor_inputs[] = { *tensor_0, *tensor_1 @@ -3125,10 +3163,9 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t char tensor_name[GGML_MAX_NAME] = { 0 }; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%2d", idx++); - uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; - //TODO:only support FP32 & FP16 - Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; - Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + Qnn_DataType_t qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); + Qnn_TensorType_t qnn_tensor_type= QNN_TENSOR_TYPE_APP_WRITE; if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; @@ -3365,7 +3402,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const //note: this function be used with proposal/refined ggml backend subsystem in this PR: // https://github.com/ggerganov/llama.cpp/pull/7641 -// new ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) +// any ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) // can following this style for mixed inference between CPU&GPU / CPU&NPU very easily GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) { GGML_UNUSED(backend); @@ -3481,7 +3518,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { /** * - * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU(aka HTP/DSP) + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer * @return */ @@ -3516,22 +3553,21 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), 1)) { - QNN_LOG_INFO("QNN DSP backend setenv successfully"); + QNN_LOG_INFO("QNN NPU backend setenv successfully"); } else { - QNN_LOG_ERROR("QNN DSP backend setenv failure"); + QNN_LOG_ERROR("QNN NPU backend setenv failure"); } if (0 == setenv("ADSP_LIBRARY_PATH", (path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), 1)) { - QNN_LOG_INFO("QNN DSP backend setenv successfully"); + QNN_LOG_INFO("QNN NPU backend setenv successfully"); } else { - QNN_LOG_ERROR("QNN DSP backend setenv failure"); + QNN_LOG_ERROR("QNN NPU backend setenv failure"); } } else { if (0 == setenv("LD_LIBRARY_PATH", - (path + - 
":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + path.c_str(), 1)) { QNN_LOG_INFO("%s backend setenv successfully\n", get_qnn_backend_name(device)); } else { diff --git a/ggml-qnn.h b/ggml-qnn.h index c61ebd25d..9ea3dcda6 100644 --- a/ggml-qnn.h +++ b/ggml-qnn.h @@ -10,19 +10,18 @@ extern "C" { #define GGML_QNN_MAX_DEVICES 3 -//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently enum QNNBackend { QNN_BACKEND_CPU, QNN_BACKEND_GPU, QNN_BACKEND_NPU, - QNN_BACKEND_GGML, //"fake" QNN backend just for compare performance between QNN and original GGML + QNN_BACKEND_GGML, //"fake" QNN backend, used for compare performance between QNN and original GGML }; GGML_API int ggml_backend_qnn_reg_devices(void); /** * - * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU(aka HTP/DSP) + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer * @return */ diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh index c7bff2ee9..192f2f4bd 100755 --- a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh +++ b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh @@ -4,7 +4,8 @@ set -e #https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct #https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools -QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ +#QNN SDK released on 20240531 +QNN_SDK_PATH=/opt/qcom/aistack/qairt/2.23.0.240531/ ANDROID_NDK=`pwd`/android-ndk-r26c ANDROID_PLATFORM=android-34 @@ -89,6 +90,23 @@ function remove_temp_dir() } +function update_qnn_libs() +{ + check_qnn_sdk + + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ + + #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ +} + + function check_qnn_libs() { #reuse the cached qnn libs in Android phone @@ -96,16 +114,7 @@ function check_qnn_libs() if [ $? 
-eq 0 ]; then printf "QNN libs already exist on Android phone\n" else - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ - - #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ + update_qnn_libs fi } @@ -155,7 +164,8 @@ function run_ggml_qnn_ut() function show_usage() { echo "Usage:" - echo " $0 build" + echo " $0 build (build Android command line UT program)" + echo " $0 updateqnnlibs (upload the latest QNN libs to Android phone)" echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" @@ -183,6 +193,9 @@ elif [ $# == 1 ]; then elif [ "$1" == "build" ]; then build_ggml_qnn_ut exit 0 + elif [ "$1" == "updateqnnlibs" ]; then + update_qnn_libs + exit 0 else ggmlop=$1 qnnbackend=0 diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 27967270b..1041252f3 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -87,7 +87,7 @@ static const char * get_qnn_backend_name(int n_backend_type) { case 1: return "QNN-GPU"; case 2: - return "QNN-NPU(HTP/DSP)"; + return "QNN-NPU"; case 3: return "ggml"; default: @@ -131,9 +131,54 @@ static bool ggml_graph_compute_helper( } -static void tensor_dump_elements(const ggml_tensor * tensor) { +#define QK8_0 32 +typedef struct { + uint16_t d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; + + +static inline float ggml_compute_fp16_to_fp32(uint16_t h) { + __fp16 tmp; + memcpy(&tmp, &h, sizeof(uint16_t)); + return (float)tmp; +} +#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) + +static void tensor_dump(const ggml_tensor * tensor, const char * name) { + QNN_LOG_DEBUG("dump ggml tensor %s(%s): type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + name, tensor->name, + tensor->type, ggml_type_name(tensor->type), + tensor->ne[0], tensor->ne[1], tensor->ne[2], + tensor->nb[0], tensor->nb[1], tensor->nb[2]); + float value = 0; std::ostringstream tmposs; + if (nullptr == tensor) { + QNN_LOG_WARN("tensor is null"); + return; + } + if (tensor->type == GGML_TYPE_I8) { + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + value = ((int8_t *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + + j * tensor->ne[0] + k]; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + tmposs << "\n"; + } + } + } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + tmposs.clear(); + tmposs.str(""); + } + } + if (tensor->type == GGML_TYPE_F32) { for (int h = 0; h < tensor->ne[3]; h++) { for (int i = 0; i < tensor->ne[2]; i++) { @@ -144,31 +189,59 @@ static void 
tensor_dump_elements(const ggml_tensor * tensor) { tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; } - if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { - QNN_LOG_DEBUG("%s", tmposs.str().c_str()); - } - tmposs.clear(); - tmposs.str(""); - //QNN_LOG_DEBUG("\n"); + tmposs << "\n"; } } } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + tmposs.clear(); + tmposs.str(""); + } } - //QNN_LOG_DEBUG("\n"); -} + if (tensor->type == GGML_TYPE_F16) { + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + unsigned short tmpvalue = ((unsigned short *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + + j * tensor->ne[0] + k]; + value = GGML_FP16_TO_FP32(tmpvalue); + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + tmposs << "\n"; + } + } + } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + tmposs.clear(); + tmposs.str(""); + } + } - -static void tensor_dump(const ggml_tensor * tensor, const char * name) { - QNN_LOG_DEBUG("dump ggml tensor %s(%s)", name, tensor->name); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)", - name, - tensor->type, ggml_type_name(tensor->type), - tensor->ne[0], tensor->ne[1], tensor->ne[2], - tensor->nb[0], tensor->nb[1], tensor->nb[2]); - tensor_dump_elements(tensor); - - QNN_LOG_DEBUG("\n"); + if (tensor->type == GGML_TYPE_Q8_0) { + block_q8_0 * tmp = ((block_q8_0 *)tensor->data); + for (int j = 0; j < tensor->ne[1]; j++) { + int n = tensor->ne[0] / QK8_0; //blocks per row + for (int z = 0; z < n; z++) { + const float d = GGML_FP16_TO_FP32(tmp[ j * n + z ].d); + for (int k = 0; k < QK8_0; k++) { + value = tmp[j * n + z].qs[k] * d; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + } + tmposs << "\n"; + } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + tmposs.clear(); + tmposs.str(""); + } + } } @@ -231,7 +304,8 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m t.join(); } if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { - ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); + //ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); + memcpy((char*)tensor->data, data.data(), size * sizeof(float)); } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); std::vector dataq(ggml_row_size(tensor->type, size)); @@ -246,10 +320,12 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m } ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im); GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size())); - ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); + //ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); + memcpy((char*)tensor->data, dataq.data(), dataq.size()); } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { // This is going to create some weird integers though. 
- ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); + //ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); + memcpy((char*)tensor->data, data.data(), ggml_nbytes(tensor)); } else { GGML_ASSERT(false); } @@ -276,16 +352,13 @@ static void show_usage() { } -int main(int argc, char * argv[]) { +static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { int64_t n_begin_time = 0LL; int64_t n_end_time = 0LL; int64_t n_duration = 0LL; size_t ctx_size = 0; int sizey = 4; int sizex = 4; - int num_threads = 4; - int n_backend_type = QNN_BACKEND_CPU; - int n_ggml_op_type = GGML_OP_ADD; struct ggml_context * ctx = nullptr; struct ggml_cgraph * gf = nullptr; @@ -294,8 +367,150 @@ int main(int argc, char * argv[]) { struct ggml_tensor * dst = nullptr; ggml_backend_t backend = nullptr; ggml_backend_buffer_t buffer= nullptr; - ggml_type qtype = GGML_TYPE_F32; + + ggml_type qtype = GGML_TYPE_I8; + qtype = GGML_TYPE_F32; + qtype = GGML_TYPE_F16; + qtype = GGML_TYPE_Q8_0; + std::vector work_buffer; + QNN_LOG_DEBUG("enter qnn_ggml_op\n"); + QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + + + n_begin_time = ggml_time_us(); + srand(time(NULL)); + + ctx_size += 1024 * 1024 * 32; + QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, + (ctx_size / 1024 / 1024)); + + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /* no_alloc =*/ 0 + }; + + if (n_backend_type != QNN_BACKEND_GGML) { + params.no_alloc = true; + backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/"); + if (nullptr == backend) { + QNN_LOG_ERROR("create qnn backend %d(%s) failed\n", n_backend_type, get_qnn_backend_name(n_backend_type)); + return 1; + } + } + + ctx = ggml_init(params); + if (!ctx) { + QNN_LOG_ERROR("%s: ggml_init() failed\n"); + return 2; + } + + QNN_LOG_DEBUG("creating new tensors\n"); + QNN_LOG_DEBUG("ggml_blck_size(%s) %d\n", ggml_type_name(qtype), ggml_blck_size(qtype)); + QNN_LOG_DEBUG("ggml_type_size(%s) %d\n", ggml_type_name(qtype), ggml_type_size(qtype)); + if (ggml_is_quantized(qtype)) { + sizex = ggml_blck_size(qtype); + + if (n_ggml_op_type == GGML_OP_MUL_MAT) { + sizex = ggml_blck_size(qtype) * 2; + } + } + QNN_LOG_DEBUG("sizex %d\n", sizex); + + if (n_ggml_op_type == GGML_OP_MUL) { + src0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + } else { + src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); + src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + } + ggml_set_input(src0); + ggml_set_input(src1); + + switch (n_ggml_op_type) { + case GGML_OP_ADD: + dst = ggml_add(ctx, src0, src1); + break; + case GGML_OP_MUL: + dst = ggml_mul(ctx, src0, src1); + break; + case GGML_OP_MUL_MAT: + dst = ggml_mul_mat(ctx, src0, src1); + break; + default: + QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, + ggml_op_name((enum ggml_op) n_ggml_op_type)); + ggml_free(ctx); + ggml_backend_free(backend); + return 3; + } + + ggml_set_output(dst); +#ifdef GGML_USE_QNN + if (n_backend_type != QNN_BACKEND_GGML) { + buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); + if (!buffer) { + QNN_LOG_ERROR("%s: failed to allocate backend buffer\n", __func__); + ggml_free(ctx); + ggml_backend_free(backend); + return 4; + } + } +#endif + + QNN_LOG_DEBUG("creating compute graph\n"); + gf = ggml_new_graph(ctx); + ggml_build_forward_expand(gf, dst); + + if (n_backend_type != 
QNN_BACKEND_GGML) { + initialize_tensors(ctx); + } else { + if (qtype == GGML_TYPE_F32) { + ggml_set_f32(src0, (rand() % 100 + 1)); + } else { + initialize_tensors(ctx); + } + ggml_set_f32(src1, (rand() % 100 + 1)); + //ggml_set_f32(dst, 0.0f); + } + + ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); + + if (get_tensor_data_size(dst) < (32 * 32)) { + QNN_LOG_DEBUG("dump tensors:\n"); + TENSOR_DUMP(src0); + TENSOR_DUMP(src1); + TENSOR_DUMP(dst); + } else { + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + } + + ggml_free(ctx); + ggml_backend_buffer_free(buffer); + ggml_backend_free(backend); + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); + return 0; +} + + +int main(int argc, char * argv[]) { + int num_threads = 4; + int n_backend_type = QNN_BACKEND_CPU; + int n_ggml_op_type = GGML_OP_ADD; for (int i = 1; i < argc; i++) { if (0 == strcmp(argv[i], "-t")) { @@ -330,121 +545,8 @@ int main(int argc, char * argv[]) { } QNN_LOG_DEBUG("enter qnn_ggml_op\n"); - QNN_LOG_DEBUG("ggml op:%d(%s)", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); - - n_begin_time = ggml_time_us(); - srand(time(NULL)); - - ctx_size += 1024 * 1024 * 32; - QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, - (ctx_size / 1024 / 1024)); - - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /* no_alloc =*/ 0 - }; - - if (n_backend_type != QNN_BACKEND_GGML) { - params.no_alloc = true; - backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/"); - if (nullptr == backend) { - QNN_LOG_ERROR("create qnn backend %d(%s) failed", n_backend_type, get_qnn_backend_name(n_backend_type)); - return 1; - } - } - - ctx = ggml_init(params); - if (!ctx) { - QNN_LOG_ERROR("%s: ggml_init() failed\n"); - return 2; - } - - QNN_LOG_DEBUG("creating new tensors\n"); - QNN_LOG_DEBUG("ggml_blck_size(%s) %d", ggml_type_name(qtype), ggml_blck_size(qtype)); - QNN_LOG_DEBUG("ggml_type_size(%s) %d", ggml_type_name(qtype), ggml_type_size(qtype)); - if (qtype != GGML_TYPE_F32) { - sizex = ggml_blck_size(qtype); - } - - src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - ggml_set_input(src0); - src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); - ggml_set_input(src1); - - switch (n_ggml_op_type) { - case GGML_OP_ADD: - dst = ggml_add(ctx, src0, src1); - break; - case GGML_OP_MUL: - dst = ggml_mul(ctx, src0, src1); - break; - case GGML_OP_MUL_MAT: - dst = ggml_mul_mat(ctx, src0, src1); - break; - default: - QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, - ggml_op_name((enum ggml_op) n_ggml_op_type)); - 
ggml_free(ctx); - ggml_backend_free(backend); - return 3; - } - - ggml_set_output(dst); -#ifdef GGML_USE_QNN - if (n_backend_type != QNN_BACKEND_GGML) { - buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); - if (!buffer) { - QNN_LOG_ERROR("%s: failed to allocate backend buffer\n", __func__); - ggml_free(ctx); - ggml_backend_free(backend); - return 4; - } - } -#endif - - QNN_LOG_DEBUG("creating compute graph\n"); - gf = ggml_new_graph(ctx); - ggml_build_forward_expand(gf, dst); - -#if 0 - ggml_set_f32(src0, (rand() % 100 + 1)); - ggml_set_f32(src1, (rand() % 100 + 1)); - ggml_set_f32(dst, 0.0f); -#else - if (n_backend_type != QNN_BACKEND_GGML) { - initialize_tensors(ctx); - } -#endif - - ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); - if (get_tensor_data_size(dst) < (32 * 32)) { - QNN_LOG_DEBUG("dump tensors:\n"); - TENSOR_DUMP(src0); - TENSOR_DUMP(src1); - TENSOR_DUMP(dst); - } else { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - } - - ggml_free(ctx); - ggml_backend_buffer_free(buffer); - ggml_backend_free(backend); - - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); + QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type); return 0; }
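
Note (not part of the patch): the core of this change is treating GGML_TYPE_Q8_0 as a supported source type — mapping it to QNN_DATATYPE_SFIXED_POINT_8 in qnn_datatype_from_ggml_datatype() and decoding Q8_0 blocks in the unit test's tensor_dump(). As a reference for reviewers, below is a minimal standalone sketch of that block layout and of the dequantization the dump code performs (value = d * qs[k]). The quantization helper follows ggml's reference Q8_0 scheme (d = amax / 127), which is an assumption here rather than code taken from the patch; the helper names quantize_block_q8_0 / dequantize_block_q8_0 are illustrative only. Like ggml-qnn-ut.cpp above, it assumes an aarch64/Android toolchain where __fp16 is available for the fp16 delta.

// q8_0_roundtrip_sketch.cpp — illustrative only, assumes clang/aarch64 (__fp16).
#include <math.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define QK8_0 32

typedef struct {
    uint16_t d;          // delta (per-block scale), stored as IEEE fp16
    int8_t   qs[QK8_0];  // 32 signed 8-bit quants
} block_q8_0;

static float fp16_to_fp32(uint16_t h) {
    __fp16 tmp;
    memcpy(&tmp, &h, sizeof(tmp));
    return (float) tmp;
}

static uint16_t fp32_to_fp16(float f) {
    __fp16 tmp = (__fp16) f;
    uint16_t h;
    memcpy(&h, &tmp, sizeof(h));
    return h;
}

// Reference-style Q8_0 quantization (assumption, mirrors ggml's scheme):
// d = amax / 127, qs[i] = round(x[i] / d).
static void quantize_block_q8_0(const float * x, block_q8_0 * out) {
    float amax = 0.0f;
    for (int i = 0; i < QK8_0; i++) {
        amax = fmaxf(amax, fabsf(x[i]));
    }
    const float d  = amax / 127.0f;
    const float id = (d != 0.0f) ? 1.0f / d : 0.0f;
    out->d = fp32_to_fp16(d);
    for (int i = 0; i < QK8_0; i++) {
        out->qs[i] = (int8_t) roundf(x[i] * id);
    }
}

// Dequantization exactly as tensor_dump() interprets GGML_TYPE_Q8_0 data:
// each of the 32 quants in a block is scaled by the block's fp16 delta.
static void dequantize_block_q8_0(const block_q8_0 * in, float * y) {
    const float d = fp16_to_fp32(in->d);
    for (int i = 0; i < QK8_0; i++) {
        y[i] = d * (float) in->qs[i];
    }
}

int main(void) {
    float x[QK8_0], y[QK8_0];
    for (int i = 0; i < QK8_0; i++) {
        x[i] = 0.1f * (float) (i - 16);  // sample data
    }

    block_q8_0 blk;
    quantize_block_q8_0(x, &blk);
    dequantize_block_q8_0(&blk, y);

    for (int i = 0; i < QK8_0; i++) {
        printf("%8.2f -> %8.2f\n", x[i], y[i]);
    }
    return 0;
}

A row of ne[0] elements therefore holds ne[0] / QK8_0 such blocks, which is why the dump loop and the test size the quantized src0 in multiples of ggml_blck_size(qtype); dequantizing for display keeps the dump comparable with the F32/F16 paths.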