From 5269e082aa479de382fefde7518a84036c1b6b7f Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Tue, 11 Jun 2024 23:05:00 +0800 Subject: [PATCH] ggml-qnn: refine ggml inference using QNN NPU --- ggml-qnn.cpp | 264 ++++++++++++------------ tests/ggml-qnn/ggml-qnn-ut-build-run.sh | 10 +- tests/ggml-qnn/ggml-qnn-ut.cpp | 42 ++-- 3 files changed, 156 insertions(+), 160 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 43a8fcd3e..4700e1451 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -55,6 +55,7 @@ #include "Saver/QnnSaver.h" #include "System/QnnSystemInterface.h" #include "HTP/QnnHtpDevice.h" +#include // ================================================================================================= // @@ -72,9 +73,16 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor); // self-defined macro / data structure // // ================================================================================================= -#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend +#ifdef NDEBUG +#define ENABLE_QNNBACKEND_DEBUG 0 // for troubleshooting QNN backend #define ENABLE_QNNSDK_LOG 0 // enable/disable QNN SDK's internal log #define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +#else +#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend +#define ENABLE_QNNSDK_LOG 1 // enable/disable QNN SDK's internal log +#define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info +#endif + #define QNN_LOGBUF_LEN 4096 #define QNN_BACKEND_NAME "qnn" @@ -393,7 +401,6 @@ static void qnn_internal_log(ggml_log_level level, const char * file, } } - static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { @@ -438,8 +445,8 @@ public: void info() { _end_time = ggml_time_us(); - _duration = (_end_time - _begin_time) / 1000; - QNN_LOG_DEBUG("duration of %s : %lld milliseconds\n", _perf_name.c_str(), _duration); + _duration = (_end_time - _begin_time); + QNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); } private: @@ -473,15 +480,15 @@ enum qnn_sdk_profile_level { profile_detail = 2 }; -using _pfn_rpc_mem_init = void (*)(void); -using _pfn_rpc_mem_deinit = void (*)(void); -using _pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); -using _pfn_rpc_mem_free = void (*)(void *); -using _pfn_rpc_mem_to_fd = int (*)(void *); +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); -using _pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); -using _pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); -using _pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); +using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); +using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); +using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); #define QNN_VER_PTR(x) (&((x).v1)) #define RPCMEM_DEFAULT_FLAGS 1 @@ -702,7 +709,7 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; - 
Qnn_ScaleOffset_t **scaleOffset = & axis_scale_offset.scaleOffset; + Qnn_ScaleOffset_t ** scaleOffset = & axis_scale_offset.scaleOffset; size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); *scaleOffset = (Qnn_ScaleOffset_t *) malloc(scaleOffsetSize); memscpy(*scaleOffset, scaleOffsetSize, @@ -732,8 +739,8 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { uint32_t rank = QNN_TENSOR_GET_RANK(src); QNN_TENSOR_SET_RANK(dst, rank); - size_t dim_size = rank * sizeof(uint32_t); - uint32_t *dimensions = (uint32_t *) malloc(dim_size); + size_t dim_size = rank * sizeof(uint32_t); + uint32_t * dimensions = (uint32_t *) malloc(dim_size); if (dimensions == nullptr) { QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying " "tensor %s\n", @@ -1072,26 +1079,26 @@ class qnn_instance { QNN_LOG_DEBUG("load rpcmem lib successfully\n"); set_rpcmem_initialized(true); } - __pfn_rpc_mem_init = reinterpret_cast<_pfn_rpc_mem_init>( + _pfn_rpc_mem_init = reinterpret_cast( dlsym(_rpc_lib_handle, "rpcmem_init")); - __pfn_rpc_mem_deinit = reinterpret_cast<_pfn_rpc_mem_deinit>( + _pfn_rpc_mem_deinit = reinterpret_cast( dlsym(_rpc_lib_handle, "rpcmem_deinit")); - __pfn_rpc_mem_alloc = reinterpret_cast<_pfn_rpc_mem_alloc>( + _pfn_rpc_mem_alloc = reinterpret_cast( dlsym(_rpc_lib_handle, "rpcmem_alloc")); - __pfn_rpc_mem_free = reinterpret_cast<_pfn_rpc_mem_free>( + _pfn_rpc_mem_free = reinterpret_cast( dlsym(_rpc_lib_handle, "rpcmem_free")); - __pfn_rpc_mem_to_fd = reinterpret_cast<_pfn_rpc_mem_to_fd>( + _pfn_rpc_mem_to_fd = reinterpret_cast( dlsym(_rpc_lib_handle, "rpcmem_to_fd")); - if (nullptr == __pfn_rpc_mem_alloc || nullptr == __pfn_rpc_mem_free || - nullptr == __pfn_rpc_mem_to_fd) { + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || + nullptr == _pfn_rpc_mem_to_fd) { QNN_LOG_WARN("unable to access symbols in QNN RPC lib. 
dlerror(): %s", dlerror()); dlclose(_rpc_lib_handle); return 9; } if (nullptr != - __pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy - __pfn_rpc_mem_init(); + _pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_init(); std::vector temp_context_config; _qnn_interface.qnn_context_create( @@ -1124,7 +1131,6 @@ class qnn_instance { } _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - //TODO: faster approach to probe the accurate capacity of rpc ion memory size_t candidate_size = 0; uint8_t * rpc_buffer = nullptr; @@ -1145,6 +1151,16 @@ class qnn_instance { if (candidate_size > _rpcmem_capacity) _rpcmem_capacity = candidate_size; QNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + + if (0 != init_htp_perfinfra()) { + QNN_LOG_WARN("initialize HTP performance failure"); + } + if (0 != set_rpc_polling()) { + QNN_LOG_WARN("set RPC polling failure"); + } + if (0 != set_high_performance_mode()) { + QNN_LOG_WARN("set HTP high performance mode failure"); + } } QNN_LOG_DEBUG("leave qni_init\n"); @@ -1156,9 +1172,8 @@ class qnn_instance { int ret_status = 0; Qnn_ErrorHandle_t error = QNN_SUCCESS; - if (nullptr != - __pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy - __pfn_rpc_mem_deinit(); + if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_deinit(); if (dlclose(_rpc_lib_handle) != 0) { QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); @@ -1325,6 +1340,8 @@ class qnn_instance { if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to get qnn device infra\n"); return 1; + } else { + QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n"); } QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); @@ -1333,6 +1350,11 @@ class qnn_instance { uint32_t device_id = 0; uint32_t core_id = 0; htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { + QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType); + } else { + QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); + } _qnn_htp_perfinfra = htp_perfinfra; _qnn_power_configid = power_configid; @@ -1343,14 +1365,17 @@ class qnn_instance { if (_qnn_rpc_pollingtime > 0) { QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime; memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime)); - rpc_pollingTime.option = - QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + rpc_pollingTime.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; - const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = { - &rpc_pollingTime, nullptr}; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_ControlLatency; + memset(&rpc_ControlLatency, 0, sizeof(rpc_ControlLatency)); + rpc_ControlLatency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; + rpc_ControlLatency.rpcControlLatencyConfig = 40; + + const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = {&rpc_pollingTime, &rpc_ControlLatency, nullptr}; if (_qnn_htp_perfinfra) { - _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, - powerConfigs); + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); } } return 0; @@ -1426,7 +1451,7 @@ class qnn_instance { } auto allocate_bytes = static_cast(bytes + alignment); - void * buf = 
__pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, + void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); if (buf == nullptr) { QNN_LOG_WARN("failed to allocate rpc memory\n"); @@ -1439,7 +1464,7 @@ class qnn_instance { _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { QNN_LOG_WARN("failed to allocate rpc memory\n"); - __pfn_rpc_mem_free(buf); + _pfn_rpc_mem_free(buf); } return aligned_buf; @@ -1451,7 +1476,7 @@ class qnn_instance { } else if (0 == _rpcmem_store_map.count(buf)) { QNN_LOG_WARN("no allocated tensor\n"); } else { - __pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); _rpcmem_store_map.erase(buf); } } @@ -1461,7 +1486,7 @@ class qnn_instance { if (!is_rpcmem_initialized()) { QNN_LOG_WARN("rpc memory not initialized\n"); } else { - mem_fd = __pfn_rpc_mem_to_fd(buf); + mem_fd = _pfn_rpc_mem_to_fd(buf); } return mem_fd; @@ -1560,7 +1585,7 @@ class qnn_instance { } auto * get_providers = - reinterpret_cast<_pfn_qnnsysteminterface_getproviders *>( + reinterpret_cast( dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); if (nullptr == get_providers) { QNN_LOG_WARN( @@ -1661,7 +1686,7 @@ class qnn_instance { return 1; } - auto get_providers = load_qnn_functionpointers<_pfn_qnninterface_getproviders *>( + auto get_providers = load_qnn_functionpointers( lib_handle, "QnnInterface_getProviders"); if (nullptr == get_providers) { QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", @@ -1805,11 +1830,11 @@ class qnn_instance { void * _rpc_lib_handle = nullptr; std::atomic_bool _rpcmem_initialized{false}; - _pfn_rpc_mem_alloc __pfn_rpc_mem_alloc; - _pfn_rpc_mem_free __pfn_rpc_mem_free; - _pfn_rpc_mem_to_fd __pfn_rpc_mem_to_fd; - _pfn_rpc_mem_init __pfn_rpc_mem_init; - _pfn_rpc_mem_deinit __pfn_rpc_mem_deinit; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; std::unordered_map _rpcmem_store_map; size_t _rpcmem_capacity = 512; @@ -1824,101 +1849,63 @@ class qnn_instance { static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, const struct ggml_tensor * tensor, bool b_dump_tensor_info) { - // only support the following 3 OPs currently - // provide a GENERAL approach could fix this problem in a standalone PR of refine ggml backend - // subsystem for mixed inference between CPU&GPU / CPU&NPU easily for ANY ggml backends - // which the backend's ggml_backend_xxx_buffer_is_host return true. 
- // this approach could be found: - // https://github.com/ggerganov/llama.cpp/pull/7641 - bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) - || (tensor->op == GGML_OP_MUL_MAT)); - if (!supported_op) { - return false; - } - - const struct ggml_tensor * src0 = tensor->src[0]; - const struct ggml_tensor * src1 = tensor->src[1]; - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne20 = tensor->ne[0]; - const int64_t ne21 = tensor->ne[1]; - - //TODO: support other quantized data type - if (ggml_is_quantized(src0->type)) { - if ((src0->type != GGML_TYPE_Q8_0) && (src0->type != GGML_TYPE_Q4_0)) { - return false; - } - } - - if (b_dump_tensor_info) { - if (tensor->op == GGML_OP_MUL_MAT) { - QNN_LOG_DEBUG("GGML_OP_MUL_MAT"); - QNN_LOG_DEBUG("op name:%s, tensor type:%s", - ggml_op_name(tensor->op), - ggml_type_name(tensor->type)); - QNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); - QNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); - QNN_LOG_DEBUG("src0 %15s: type = %i (%5s) ne = %5" PRIi64 - " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("src1 %15s: type = %i (%5s) ne = %5" PRIi64 - " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG( - " %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - tensor->name, tensor->type, ggml_type_name(tensor->type), - tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], - tensor->nb[1], tensor->nb[2]); - } - } - if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) { return false; } - // make qnn_get_ggml_tensor_rank and QNN SDK happy - if ((ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1)) { + const struct ggml_tensor * src0 = tensor->src[0]; + const struct ggml_tensor * src1 = tensor->src[1]; + if (nullptr == src0 || nullptr == src1) { return false; } - int qtype = src0->type; - if (tensor->op == GGML_OP_ADD) { - return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || - qtype == GGML_TYPE_Q8_0) && - (src1->type == GGML_TYPE_F32); + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + // make qnn_get_ggml_tensor_rank and QNN SDK happy + if (ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1) { + return false; } + // TODO: support other GGML OPs using QNN API + // a GENERAL approach could fix this problem in a standalone PR of refine ggml backend + // subsystem for mixed inference between CPU&GPU / CPU&NPU easily for ANY ggml backends + // which the backend's ggml_backend_xxx_buffer_is_host return true. 
+ // this approach could be found: + // https://github.com/ggerganov/llama.cpp/pull/7641 + bool supported_op = false; + supported_op = (tensor->op == GGML_OP_ADD); + supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT)); + if (!supported_op) { + return false; + } + + //TODO: support other quantized data type + if (ggml_is_quantized(src0->type)) { + if (src0->type != GGML_TYPE_Q8_0 && src0->type != GGML_TYPE_Q4_0) { + return false; + } + } + + int qtype = src0->type; if (tensor->op == GGML_OP_MUL) { return (qtype == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32); } if (tensor->op == GGML_OP_MUL_MAT) { - if (ctx->device == QNN_BACKEND_GGML) { - return (ne00 == ne10) && (src1->ne[2] % src0->ne[2] == 0) && - (src1->ne[3] % src0->ne[3] == 0); - } - if ((ctx->device == QNN_BACKEND_NPU) && (qtype == GGML_TYPE_Q8_0) && - (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32)) { + if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) { + return false; + } else { return true; } - if (ctx->device == QNN_BACKEND_CPU || ctx->device == QNN_BACKEND_GPU) { - return (ne00 == ne10) && (ne00 == ne01); - } - return false; } -} + return true; +} static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -1978,10 +1965,25 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; - QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); + QNN_LOG_INFO("graph name %s", graph_name.c_str()); + if (ctx->device == QNN_BACKEND_NPU) { + QnnHtpGraph_CustomConfig_t custom_config; + custom_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + custom_config.numHvxThreads = 8; + + QnnGraph_Config_t graph_config; + graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_config.customConfig = &custom_config; + const QnnGraph_Config_t * p_graphconfig[] = {&graph_config, NULL}; + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, + &graph_handle); + } else { + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); + } + if (QNN_SUCCESS != error) { QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " "error = %d\n", @@ -2112,8 +2114,6 @@ failure: dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); } QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; @@ -2198,7 +2198,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; - QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + QNN_LOG_INFO("graph name %s", graph_name.c_str()); error = qnn_raw_interface.graphCreate( instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); @@ -2331,8 +2331,6 @@ failure: " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", 
src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); } QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; @@ -2894,7 +2892,6 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t Qnn_ErrorHandle_t error = QNN_SUCCESS; ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; - static int idx = 0; char tensor_name[GGML_MAX_NAME] = {0}; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); @@ -3061,7 +3058,7 @@ GGML_CALL static const char * ggml_backend_qnn_name(ggml_backend_t backend) { GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { QNN_LOG_INFO("enter %s", __func__); ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - QNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); + QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); qnn_instance * instance = (qnn_instance *)g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { @@ -3073,7 +3070,7 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { auto & graph_item = graph_it->second; Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); GGML_UNUSED(graph_handle); - QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); + QNN_LOG_INFO("graph type:%s", graph_it->first.c_str()); } instance->_qnn_graph_map.clear(); @@ -3104,7 +3101,7 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe params.type = GGML_TASK_TYPE_COMPUTE; params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *node = cgraph->nodes[i]; + ggml_tensor * node = cgraph->nodes[i]; if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { @@ -3213,7 +3210,6 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { device, GGML_QNN_MAX_DEVICES - 1); return nullptr; } - static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; static bool ggml_backend_qnn_buffer_type_initialized = false; diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh index 192f2f4bd..4c21be5a4 100755 --- a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh +++ b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh @@ -12,6 +12,8 @@ ANDROID_PLATFORM=android-34 GGML_QNN_UT=ggml-qnn-ut REMOTE_PATH=/data/local/tmp/ +BUILDTYPE=Debug +BUILDTYPE=Release function dump_vars() @@ -70,7 +72,7 @@ function check_and_download_ndk() function build_arm64 { - cmake -H. -B./out/arm64-v8a -DTARGET_NAME=${GGML_QNN_UT} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} + cmake -H. 
-B./out/arm64-v8a -DTARGET_NAME=${GGML_QNN_UT} -DCMAKE_BUILD_TYPE=${BUILDTYPE} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} cd ./out/arm64-v8a make @@ -166,9 +168,9 @@ function show_usage() echo "Usage:" echo " $0 build (build Android command line UT program)" echo " $0 updateqnnlibs (upload the latest QNN libs to Android phone)" - echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" - echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" - echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" + echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" + echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" + echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" echo -e "\n\n\n" } diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index eb072beae..9af433ceb 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -72,14 +72,12 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { - //for Android command line application or WoA printf("%s\n", s_ggml_qnn_log_internal_buf); } va_end(args); } } - static const char * get_qnn_backend_name(int n_backend_type) { switch (n_backend_type) { case 0: @@ -95,7 +93,6 @@ static const char * get_qnn_backend_name(int n_backend_type) { } } - static bool ggml_graph_compute_helper( struct ggml_backend * backend, struct ggml_cgraph * graph, @@ -123,26 +120,25 @@ static bool ggml_graph_compute_helper( } #endif - //a new approch of mixed inference if (nullptr != backend) return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS; else return ggml_graph_compute(graph, &plan); } - #define QK8_0 32 + typedef struct { uint16_t d; // delta int8_t qs[QK8_0]; // quants } block_q8_0; - static inline float ggml_compute_fp16_to_fp32(uint16_t h) { __fp16 tmp; memcpy(&tmp, &h, sizeof(uint16_t)); return (float)tmp; } + #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) static void tensor_dump(const ggml_tensor * tensor, const char * name) { @@ -245,7 +241,6 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { } } - static uint32_t get_tensor_rank(const ggml_tensor * tensor) { uint32_t rank = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { @@ -256,7 +251,6 @@ static uint32_t get_tensor_rank(const ggml_tensor * tensor) { return rank; } - static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); size_t n_dims = get_tensor_rank(tensor); @@ -270,7 +264,6 @@ static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { return ggml_nbytes(tensor); } - //ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { // static RNG initialization (revisit if n_threads stops being constant) @@ -305,8 +298,11 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m t.join(); } if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { - 
//ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); +#ifdef GGML_USE_QNN memcpy((char*)tensor->data, data.data(), size * sizeof(float)); +#else + ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); +#endif } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); std::vector dataq(ggml_row_size(tensor->type, size)); @@ -321,18 +317,23 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m } ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im); GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size())); - //ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); +#ifdef GGML_USE_QNN memcpy((char*)tensor->data, dataq.data(), dataq.size()); +#else + ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); +#endif } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { // This is going to create some weird integers though. - //ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); +#ifdef GGML_USE_QNN memcpy((char*)tensor->data, data.data(), ggml_nbytes(tensor)); +#else + ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); +#endif } else { GGML_ASSERT(false); } } - //ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L310 static void initialize_tensors(ggml_context * ctx) { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { @@ -340,19 +341,17 @@ static void initialize_tensors(ggml_context * ctx) { } } - static void show_usage() { printf(" " \ "\nUsage: test_qnn_ops [options]\n" \ "\n" \ "Options:\n" \ " -t GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MULMAT\n" \ - " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU)\n" \ + " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" \ " ?/h print usage infomation\n\n" ); } - static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { int64_t n_begin_time = 0LL; int64_t n_end_time = 0LL; @@ -369,16 +368,15 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { ggml_backend_t backend = nullptr; ggml_backend_buffer_t buffer= nullptr; - ggml_type qtype = GGML_TYPE_I8; - qtype = GGML_TYPE_F32; + ggml_type qtype = GGML_TYPE_I8; qtype = GGML_TYPE_F16; qtype = GGML_TYPE_Q8_0; + qtype = GGML_TYPE_F32; std::vector work_buffer; QNN_LOG_DEBUG("enter qnn_ggml_op\n"); QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); - n_begin_time = ggml_time_us(); srand(time(NULL)); @@ -473,7 +471,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { initialize_tensors(ctx); } ggml_set_f32(src1, (rand() % 100 + 1)); - //ggml_set_f32(dst, 0.0f); } ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); @@ -501,13 +498,13 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { ggml_free(ctx); ggml_backend_buffer_free(buffer); ggml_backend_free(backend); + n_end_time = ggml_time_us(); n_duration = (n_end_time - n_begin_time) / 1000; QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); return 0; } - int main(int argc, char * argv[]) { int num_threads = 4; int n_backend_type = 
QNN_BACKEND_CPU;
@@ -531,7 +528,7 @@ int main(int argc, char * argv[]) {
         } else if (0 == strcmp(argv[i], "-b")) {
             if (i + 1 < argc) {
                 int backend = atoi(argv[i + 1]);
-                if (backend <= QNN_BACKEND_NPU)
+                if (backend <= QNN_BACKEND_GGML)
                     n_backend_type = backend;
                 else {
                     show_usage();
@@ -549,5 +546,6 @@
     QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type,
                   n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type));
     qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type);
+
     return 0;
 }
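
For reference, a minimal sketch of how this patch can be exercised, based on the usage text of tests/ggml-qnn/ggml-qnn-ut-build-run.sh shown above. It assumes the script is invoked from tests/ggml-qnn with ANDROID_NDK and QNN_SDK_PATH set up the way the script expects and an Android device reachable over adb; backend index 2 selects QNN_NPU and 3 selects the plain ggml backend per the new usage message:

    ./ggml-qnn-ut-build-run.sh build                  # build the Android command line UT program
    ./ggml-qnn-ut-build-run.sh updateqnnlibs          # upload the latest QNN libs to the phone
    ./ggml-qnn-ut-build-run.sh GGML_OP_MUL_MAT 2      # run GGML_OP_MUL_MAT on the QNN_NPU backend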