review: make an MVP (Minimum Viable PR) style PR in upstream
commit 5598fbd15d (parent faaa86b7e4)
3 changed files with 185 additions and 443 deletions
ggml-qnn.cpp
@@ -55,7 +55,7 @@
 #include "Saver/QnnSaver.h"
 #include "System/QnnSystemInterface.h"
 #include "HTP/QnnHtpDevice.h"
-#include <HTP/QnnHtpGraph.h>
+#include "HTP/QnnHtpGraph.h"

 // =================================================================================================
 //
@@ -91,12 +91,6 @@ typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx,
                                 const ggml_tensor * src1,
                                 ggml_tensor * dst);

-typedef void (*ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx,
-                                       const ggml_op ggml_op,
-                                       const ggml_tensor * src0,
-                                       const ggml_tensor * src1,
-                                       ggml_tensor * dst);
-
 enum qcom_htp_arch {
     NONE = 0,
     V68 = 68,
@@ -424,6 +418,7 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tenso
     return true;
 }

+#ifndef NDEBUG
 #define CHECK_PARAMS(ctx, src0, src1, dst)                          \
     do {                                                            \
         if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) {  \
@@ -431,6 +426,10 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tenso
         }                                                           \
     } while (0)

+#else
+#define CHECK_PARAMS(ctx, src0, src1, dst)
+#endif
+
 #if ENABLE_QNNBACKEND_PERF
 class qnn_perf {
 public:
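Note on the hunk above: with the new #else branch, CHECK_PARAMS compiles to nothing when NDEBUG is defined, so release builds pay no validation cost. A minimal self-contained sketch of the same pattern, with stub types standing in for the real ggml/QNN structs (everything below is illustrative, not code from this PR):

#include <cstdio>

struct ctx_t {};
struct tensor_t {};

// stand-in for qnn_is_valid_params(); always succeeds here
static bool is_valid(ctx_t *, tensor_t *, tensor_t *, tensor_t *) { return true; }

#ifndef NDEBUG
#define CHECK_PARAMS(ctx, a, b, c)                \
    do {                                          \
        if (!is_valid((ctx), (a), (b), (c))) {    \
            return;                               \
        }                                         \
    } while (0)
#else
// release build: the check disappears entirely
#define CHECK_PARAMS(ctx, a, b, c)
#endif

static void op(ctx_t * ctx, tensor_t * a, tensor_t * b, tensor_t * c) {
    CHECK_PARAMS(ctx, a, b, c);
    printf("op body runs\n");
}

int main() {
    ctx_t ctx; tensor_t a, b, c;
    op(&ctx, &a, &b, &c);
    return 0;
}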
@@ -446,7 +445,7 @@ public:
     void info() {
         _end_time = ggml_time_us();
         _duration = (_end_time - _begin_time);
-        QNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration);
+        QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration);
     }

 private:
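The qnn_perf class above brackets an OP with start()/info() and logs the elapsed microseconds. A runnable approximation of the same shape, using std::chrono in place of ggml_time_us() and printf in place of QNN_LOG_INFO (both substitutions are assumptions for a standalone build):

#include <chrono>
#include <cstdio>
#include <string>

class perf_timer {
public:
    explicit perf_timer(std::string name) : _name(std::move(name)) {}
    void start() { _begin = clock::now(); }
    void info() {
        auto us = std::chrono::duration_cast<std::chrono::microseconds>(
            clock::now() - _begin).count();
        printf("duration of %s : %lld microseconds\n", _name.c_str(), (long long) us);
    }
private:
    using clock = std::chrono::steady_clock;
    std::string _name;
    clock::time_point _begin;
};

int main() {
    perf_timer perf("ggml_qnn_add");
    perf.start();
    // ... work under measurement would run here ...
    perf.info();
    return 0;
}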
@@ -809,7 +808,7 @@ static void qnn_sdk_logcallback(const char * fmt, QnnLog_Level_t level,

         memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN);
         vsnprintf(reinterpret_cast<char *const>(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp);
-        QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf);
+        QNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf);
     }
 #endif
 }
@@ -1069,7 +1068,7 @@ class qnn_instance {
         arch_devconfig.option       = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
         arch_devconfig.customConfig = &arch_customconfig;

-        const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, NULL};
+        const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr};
         qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle);
     } else {
         qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle);
@@ -1137,10 +1136,14 @@ class qnn_instance {
         _pfn_rpc_mem_init();
     }

     std::vector<const QnnContext_Config_t *> temp_context_config;
+    /* TODO: not used, keep it for further usage
+       QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT;
+       qnn_context_config.priority = QNN_PRIORITY_DEFAULT;
+       const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr};
+    */
     _qnn_interface.qnn_context_create(
         _qnn_backend_handle, _qnn_device_handle,
-        temp_context_config.empty() ? nullptr : temp_context_config.data(),
+        nullptr,
         &_qnn_context_handle);
     if (nullptr == _qnn_context_handle) {
         QNN_LOG_WARN("why failed to initialize qnn context\n");
@@ -1157,9 +1160,11 @@ class qnn_instance {
         size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048};
         size_t probe_counts  = sizeof(probe_slots) / sizeof(size_t);
         for (size_t idx = 0; idx < probe_counts; idx++) {
-            rpc_buffer = static_cast<uint8_t *>(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4));
+            rpc_buffer = static_cast<uint8_t *>(alloc_rpcmem(
+                probe_slots[idx] * size_in_mb, 4));
             if (nullptr == rpc_buffer) {
-                QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno));
+                QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n",
+                             probe_slots[idx], strerror(errno));
                 break;
             } else {
                 candidate_size = probe_slots[idx];
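The probe loop above discovers how much shared (rpcmem) memory the device will actually grant: it tries the candidate capacities in ascending order, remembers the last success, and stops at the first failure. A standalone sketch of that control flow, with malloc standing in for qnn_instance::alloc_rpcmem (an assumption; note that malloc on Linux may overcommit, so this only demonstrates the flow, not real device limits):

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cerrno>

int main() {
    const size_t size_in_mb = 1024 * 1024;
    // candidate capacities in MB, probed in ascending order (same values as the diff)
    size_t probe_slots[]  = {1024, 1536, 2048 - 48, 2048};
    size_t probe_counts   = sizeof(probe_slots) / sizeof(size_t);
    size_t candidate_size = 0;

    for (size_t idx = 0; idx < probe_counts; idx++) {
        // malloc stands in for alloc_rpcmem(bytes, alignment)
        void * buf = malloc(probe_slots[idx] * size_in_mb);
        if (buf == nullptr) {
            printf("alloc %zu (MB) failure, %s\n", probe_slots[idx], strerror(errno));
            break;                          // first failure ends the probe
        }
        candidate_size = probe_slots[idx];  // remember the last success
        free(buf);                          // release the probe buffer (the real code
                                            // presumably uses free_rpcmem; assumption)
    }
    printf("usable capacity: %zu MB\n", candidate_size);
    return 0;
}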
@@ -1262,8 +1267,8 @@ class qnn_instance {
         return ret_status;
     }

-    //keep it for further usage of offload the entire cgraph to a single QNN DAG directly
-    //which was used in Qualcomm's dedicated AI technology
+    //TODO:keep it for further usage of offload the entire cgraph to a single QNN DAG directly
+    //     which was used in Qualcomm's dedicated AI technology
 #if 0
     int init_qnn_graph(const char * graph_name, bool debug,
                        uint8_t do_node_validation = true,
@@ -1430,13 +1435,14 @@ class qnn_instance {
             QnnHtpPerfInfrastructure_PowerConfig_t power_config;
             memset(&power_config, 0, sizeof(power_config));
             power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3;
-            power_config.dcvsV3Config.dcvsEnable = 0;
+            power_config.dcvsV3Config.setDcvsEnable = 1;
+            power_config.dcvsV3Config.dcvsEnable    = 0;
             power_config.dcvsV3Config.contextId = _qnn_power_configid;
             power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE;
             power_config.dcvsV3Config.setSleepLatency =
                 1; // true to consider Latency parameter otherwise false
-            power_config.dcvsV3Config.sleepLatency = 10;
+            power_config.dcvsV3Config.sleepLatency = 40;
             power_config.dcvsV3Config.setBusParams =
                 1; // true to consider Bus parameter otherwise false
             power_config.dcvsV3Config.setCoreParams =
@@ -1459,6 +1465,7 @@ class qnn_instance {
                 DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
             power_config.dcvsV3Config.coreVoltageCornerMax =
                 DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;

+            // set power config with different performance parameters
             const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {
                 &power_config, nullptr};
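In the DCVS v3 block above, each value field travels with a companion set* flag; judging by the "true to consider ... otherwise false" comments in the diff, the driver only honors a value whose flag is 1. So setDcvsEnable=1 plus dcvsEnable=0 reads as "explicitly turn DCVS off" (plausibly to keep clocks pinned under PERFORMANCE_MODE), and sleepLatency=40 is consumed because setSleepLatency=1. A self-contained model of that convention, using plain structs rather than the QNN SDK types:

#include <cstdint>
#include <cstdio>

// mirrors the pairing convention of the dcvsV3Config fields:
// a value is consumed only when its companion set* flag is 1
struct dcvs_v3_model {
    uint32_t setDcvsEnable   = 0;
    uint32_t dcvsEnable      = 0;
    uint32_t setSleepLatency = 0;
    uint32_t sleepLatency    = 0;
};

static void apply(const dcvs_v3_model & c) {
    if (c.setDcvsEnable)   printf("driver applies dcvsEnable=%u\n", c.dcvsEnable);
    if (c.setSleepLatency) printf("driver applies sleepLatency=%u\n", c.sleepLatency);
}

int main() {
    dcvs_v3_model cfg;
    cfg.setDcvsEnable   = 1;  // opt in to touching DCVS at all
    cfg.dcvsEnable      = 0;  // ...and turn it off
    cfg.setSleepLatency = 1;
    cfg.sleepLatency    = 40; // latency hint, the value the new code uses
    apply(cfg);
    return 0;
}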
@@ -1550,6 +1557,7 @@ class qnn_instance {
             QNN_LOG_WARN("rpc memory already allocated\n");
             return 3;
         }

         if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) {
             QNN_LOG_WARN("tensor %s has been registered shared memory\n",
                          (QNN_VER_PTR(*p_tensor)->name));
@@ -1710,7 +1718,7 @@ class qnn_instance {
         int result = 0;

         if (nullptr == _system_lib_handle) {
-            QNN_LOG_DEBUG("system lib handle is null\n");
+            QNN_LOG_WARN("system lib handle is null\n");
             return 1;
         }

@@ -1724,8 +1732,7 @@ class qnn_instance {

         int dlclose_error = dlclose(_system_lib_handle);
         if (dlclose_error != 0) {
-            QNN_LOG_WARN("failed to close QnnSystem library, error %s\n",
-                         dlerror());
+            QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror());
             return 2;
         }

@@ -1740,8 +1747,7 @@ class qnn_instance {

         void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL);
         if (nullptr == lib_handle) {
-            QNN_LOG_WARN("can not open QNN library %s, with error: %s",
-                         lib_path.c_str(), dlerror());
+            QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror());
             return 1;
         }

@@ -1749,8 +1755,7 @@ class qnn_instance {
         load_qnn_functionpointers<pfn_qnninterface_getproviders *>(
             lib_handle, "QnnInterface_getProviders");
         if (nullptr == get_providers) {
-            QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s",
-                         dlerror());
+            QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror());
             return 2;
         }

@@ -1758,14 +1763,12 @@ class qnn_instance {
         const QnnInterface_t ** provider_list = nullptr;
         error = get_providers(&provider_list, &num_providers);
         if (error != QNN_SUCCESS) {
-            QNN_LOG_WARN("failed to get providers, error %d",
-                         QNN_GET_ERROR_CODE(error));
+            QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error));
             return 3;
         }
         QNN_LOG_DEBUG("num_providers=%d\n", num_providers);
         if (num_providers != _required_num_providers) {
-            QNN_LOG_WARN("providers is %d instead of required %d", num_providers,
-                         _required_num_providers);
+            QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers);
             return 4;
         }

@@ -1797,16 +1800,14 @@ class qnn_instance {
         BackendIdType backend_id = provider_list[0]->backendId;
         _lib_path_to_backend_id[lib_path] = backend_id;
         if (_loaded_backend.count(backend_id) > 0) {
-            QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n",
-                         lib_path.c_str(), backend_id);
+            QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id);
         }
         _loaded_backend[backend_id] = provider_list[0];
         if (_loaded_lib_handle.count(backend_id) > 0) {
             QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]);
             int dlclose_error = dlclose(_loaded_lib_handle[backend_id]);
             if (dlclose_error != 0) {
-                QNN_LOG_WARN("fail to close %p with error %s\n",
-                             _loaded_lib_handle[backend_id], dlerror());
+                QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror());
             }
         }
         _loaded_lib_handle[backend_id] = lib_handle;
@@ -1820,8 +1821,7 @@ class qnn_instance {
         for (auto & it : _loaded_lib_handle) {
             dlclose_error = dlclose(it.second);
             if (dlclose_error != 0) {
-                QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first,
-                             dlerror());
+                QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror());
             }
         }

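The load/unload paths above follow the usual dlopen/dlsym/dlclose discipline: check every return value and report dlerror(). A minimal runnable version of the same pattern, with libm and cos as stand-ins for the QNN backend library and QnnInterface_getProviders (the "libm.so.6" name is Linux-specific; build with -ldl):

#include <cstdio>
#include <dlfcn.h>

int main() {
    void * handle = dlopen("libm.so.6", RTLD_NOW | RTLD_GLOBAL);
    if (handle == nullptr) {
        fprintf(stderr, "can not open library, with error: %s\n", dlerror());
        return 1;
    }

    typedef double (*cos_fn)(double);
    cos_fn fn = reinterpret_cast<cos_fn>(dlsym(handle, "cos"));
    if (fn == nullptr) {
        fprintf(stderr, "can not load symbol: %s\n", dlerror());
        dlclose(handle);
        return 2;
    }
    printf("cos(0) = %f\n", fn(0.0));

    if (dlclose(handle) != 0) {
        fprintf(stderr, "failed to close library, error %s\n", dlerror());
        return 3;
    }
    return 0;
}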
@@ -1924,7 +1924,6 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
     const int64_t ne01 = src0->ne[1];
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
-
     // make qnn_get_ggml_tensor_rank and QNN SDK happy
     if (ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1) {
         return false;
@@ -1932,13 +1931,13 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,

     // TODO: support other GGML OPs using QNN API
     // a GENERAL approach could fix this problem in a standalone PR of refine ggml backend
-    // subsystem for mixed inference between CPU&GPU / CPU&NPU easily for ANY ggml backends
-    // which the backend's ggml_backend_xxx_buffer_is_host return true.
-    // this approach could be found:
+    // subsystem for hybrid inference between CPU&GPU / CPU&NPU easily(less the 100 LoC and no
+    // side-effect to the existing codes) for ANY ggml backends which the backend's
+    // ggml_backend_xxx_buffer_is_host return true. this approach could be found at:
     // https://github.com/ggerganov/llama.cpp/pull/7641
     bool supported_op = false;
     supported_op = (tensor->op == GGML_OP_ADD);
-    supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT));
+    supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT));
     if (!supported_op) {
         return false;
     }
@@ -1950,14 +1949,9 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
         }
     }

-    int qtype = src0->type;
-    if (tensor->op == GGML_OP_MUL) {
-        return (qtype == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32);
-    }
-
     if (tensor->op == GGML_OP_MUL_MAT) {
         if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) {
-            //make mul_mat with QNN RPC happy
+            //comment it for make UT of mul_mat with QNN RPC happy
             //return false;
         }
     }
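After these two hunks the predicate reduces to: every leading dimension must be at least 2, and only GGML_OP_ADD and GGML_OP_MUL_MAT are offloaded. A condensed standalone sketch of the resulting logic (stub enum and tensor struct; the real function also checks tensor types and carries the commented-out RPC size caveat):

#include <cstdint>

enum op_t { OP_ADD, OP_MUL, OP_MUL_MAT };

struct tensor_t {
    op_t    op;
    int64_t ne[4];   // dimensions, as in ggml_tensor::ne
};

// condensed model of ggml_qnn_can_handle_op() after this commit
static bool can_handle(const tensor_t & src0, const tensor_t & src1, const tensor_t & t) {
    // make qnn_get_ggml_tensor_rank and QNN SDK happy: no degenerate dims
    if (src0.ne[0] <= 1 || src0.ne[1] <= 1 || src1.ne[0] <= 1 || src1.ne[1] <= 1) {
        return false;
    }
    // MVP scope: only ADD and MUL_MAT are offloaded
    return t.op == OP_ADD || t.op == OP_MUL_MAT;
}

int main() {
    tensor_t a{OP_ADD, {4, 4, 1, 1}}, b{OP_ADD, {4, 4, 1, 1}};
    tensor_t add{OP_ADD, {4, 4, 1, 1}}, mul{OP_MUL, {4, 4, 1, 1}};
    return (can_handle(a, b, add) && !can_handle(a, b, mul)) ? 0 : 1;
}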
@@ -1965,6 +1959,8 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
     return true;
 }

+//TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat
+//      keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC
 static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
                          const ggml_tensor * src1, ggml_tensor * dst) {
     Qnn_ErrorHandle_t error = QNN_SUCCESS;
@@ -1986,10 +1982,11 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
     tensor_1 = (Qnn_Tensor_t *) src1->extra;
     tensor_2 = (Qnn_Tensor_t *) dst->extra;
     instance = ctx->instance;
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;

     qnn_perf perf("ggml_qnn_add");
     perf.start();

-    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
     QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
@@ -2034,17 +2031,31 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src

         QnnHtpGraph_CustomConfig_t dlbc_config;
         dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
         /*
         dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
         dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC
         */
         dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
         dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
         QnnGraph_Config_t graph_dlbc_config;
         graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
         graph_dlbc_config.customConfig = &dlbc_config;

-        const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL};
+        QnnHtpGraph_CustomConfig_t opt_config;
+        opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+        opt_config.optimizationOption.floatValue = 1; // 1 / 3
+        QnnGraph_Config_t graph_opt_config;
+        graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+        graph_opt_config.customConfig = &opt_config;
+
+        QnnHtpGraph_CustomConfig_t vtcm_config;
+        vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
+        vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb;
+        QnnGraph_Config_t graph_vtcm_config;
+        graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+        graph_vtcm_config.customConfig = &vtcm_config;
+
+        const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config,
+                                                     &graph_dlbc_config,
+                                                     &graph_vtcm_config,
+                                                     &graph_opt_config,
+                                                     NULL};
         error = qnn_raw_interface.graphCreate(
             instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig,
             &graph_handle);
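Both graph-creation sites in this commit now assemble the same four-entry, NULL-terminated QnnGraph_Config_t list (HVX, DLBC, VTCM, finalize-optimization). The QNN API finds the end of the array via the terminating null pointer rather than an explicit count. A self-contained model of that calling convention:

#include <cstdio>

struct config_t { const char * name; };

// consumer walks until the null terminator, QnnGraph-style
static void create_with_configs(const config_t * const * configs) {
    for (; *configs != nullptr; ++configs) {
        printf("applying config: %s\n", (*configs)->name);
    }
}

int main() {
    config_t hvx{"hvx"}, dlbc{"dlbc"}, vtcm{"vtcm"}, opt{"finalize-opt"};
    const config_t * p_graphconfig[] = {&hvx, &dlbc, &vtcm, &opt, nullptr};
    create_with_configs(p_graphconfig);
    return 0;
}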
@@ -2113,27 +2124,33 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
         uint8_t * qnn_buffer_2 = nullptr;
         qnn_instance * instance = ctx->instance;

-        qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
+        qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+            ggml_nbytes(src0), 4));
         if (nullptr == qnn_buffer_0) {
             QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
             goto failure;
         } else {
             QNN_LOG_INFO("alloc rpcmem successfully\n");
         }
         instance->register_rpcmem(qnn_buffer_0, tensor_0);
         memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));

-        qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src1), 4));
+        qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+            ggml_nbytes(src1), 4));
         if (nullptr == qnn_buffer_1) {
             QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
             goto failure;
         } else {
             QNN_LOG_INFO("alloc rpcmem successfully\n");
         }
         instance->register_rpcmem(qnn_buffer_1, tensor_1);
         memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));

-        qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(dst), 4));
+        qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+            ggml_nbytes(dst), 4));
         if (nullptr == qnn_buffer_2) {
             QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
             goto failure;
         } else {
             QNN_LOG_INFO("alloc rpcmem successfully\n");
         }
@@ -2144,23 +2161,33 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
         Qnn_OpConfig_t op_config = {
             (Qnn_OpConfigVersion_t) 1,
-            .v1 = {"ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW,
-                   QNN_OP_ELEMENT_WISE_ADD, 0, qnn_params,
-                   2, tensor_inputs, 1,
-                   tensor_outputs}};
+            .v1 = {"ggml_op_add",
+                   QNN_OP_PACKAGE_NAME_QTI_AISW,
+                   QNN_OP_ELEMENT_WISE_ADD,
+                   0, qnn_params,
+                   2, tensor_inputs,
+                   1,tensor_outputs}
+        };
         error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
-        error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
+        error = qnn_raw_interface.graphFinalize(graph_handle,
+                                                nullptr, nullptr);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
-        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2,
+        error = qnn_raw_interface.graphExecute(graph_handle,
+                                               tensor_inputs, 2,
                                                tensor_outputs, 1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
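The added NPU branch distinguishes a sub-system restart (the DSP crashing and rebooting, "SSR") from ordinary graph errors by matching QNN_COMMON_ERROR_SYSTEM_COMMUNICATION. Since the same check now follows every graphExecute call in this file, it could be centralized; a standalone sketch of such a wrapper, with stub types standing in for the QNN handles (hypothetical helper, not code from this PR):

#include <cstdint>
#include <cstdio>

// stub stand-ins for the QNN types used in ggml-qnn.cpp
using error_t = int;
using graph_t = void *;
struct tensor_t {};
constexpr error_t ERR_SYSTEM_COMMUNICATION = 0x2010;  // placeholder value
enum device_t { BACKEND_CPU, BACKEND_GPU, BACKEND_NPU };

static error_t graph_execute(graph_t, tensor_t *, uint32_t, tensor_t *, uint32_t) {
    return ERR_SYSTEM_COMMUNICATION;  // simulate an NPU sub-system restart
}

// centralizes the SSR check that the diff repeats after every graphExecute call
static error_t graph_execute_checked(device_t dev, graph_t g,
                                     tensor_t * in,  uint32_t n_in,
                                     tensor_t * out, uint32_t n_out) {
    error_t error = graph_execute(g, in, n_in, out, n_out);
    if (dev == BACKEND_NPU && error == ERR_SYSTEM_COMMUNICATION) {
        printf("NPU crashed. SSR detected. Caused QNN graph execute error\n");
    }
    return error;
}

int main() {
    tensor_t in[2], out[1];
    graph_execute_checked(BACKEND_NPU, nullptr, in, 2, out, 1);
    return 0;
}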
@@ -2221,9 +2248,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src

         Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2,
+        error = qnn_raw_interface.graphExecute(graph_handle,
+                                               tensor_inputs,2,
                                                tensor_outputs,1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
@@ -2299,6 +2332,8 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
     tensor_1 = (Qnn_Tensor_t *) src1->extra;
     tensor_2 = (Qnn_Tensor_t *) dst->extra;
     instance = ctx->instance;
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;

     qnn_perf perf("ggml_qnn_mul_mat");
     perf.start();
@@ -2307,7 +2342,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
     tensor_2 = (Qnn_Tensor_t *) dst->extra;
     instance = ctx->instance;

-    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
     QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
@@ -2338,6 +2372,11 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
     uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions;
     uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions;

+    //TODO: for scenarios of quantized data in src0
+    //      pass-1: dequantize src0 to FP32
+    //      pass-2: dq-src0 * src1
+    //      the performance gains is worth although there is performance loss in pass-1
+
     if (!graph_initialized) {
         graph_name = graph_name + "_" + std::to_string(ctx->threads) +
                      "_" + src0->name + "_" + src1->name;
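The new TODO sketches a two-pass scheme for quantized src0: dequantize it to FP32 first, then run an FP32 matmul on the NPU. A toy standalone illustration of those two passes, using a hypothetical int8+scale quantization rather than ggml's actual Q formats:

#include <cstdint>
#include <cstdio>
#include <vector>

// pass-1: dequantize a toy int8+scale block to FP32
static std::vector<float> dequantize(const std::vector<int8_t> & q, float scale) {
    std::vector<float> out(q.size());
    for (size_t i = 0; i < q.size(); i++) out[i] = q[i] * scale;
    return out;
}

// pass-2: plain FP32 matmul (row-major, m x k times k x n)
static std::vector<float> matmul(const std::vector<float> & a,
                                 const std::vector<float> & b,
                                 int m, int k, int n) {
    std::vector<float> c(m * n, 0.0f);
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
            for (int l = 0; l < k; l++)
                c[i * n + j] += a[i * k + l] * b[l * n + j];
    return c;
}

int main() {
    std::vector<int8_t> q    = {1, 2, 3, 4};   // quantized 2x2 src0
    std::vector<float>  src1 = {1, 0, 0, 1};   // FP32 2x2 identity
    auto src0 = dequantize(q, 0.5f);           // pass-1
    auto dst  = matmul(src0, src1, 2, 2, 2);   // pass-2
    printf("%.1f %.1f / %.1f %.1f\n", dst[0], dst[1], dst[2], dst[3]);
    return 0;
}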
@@ -2352,17 +2391,31 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,

         QnnHtpGraph_CustomConfig_t dlbc_config;
         dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
         /*
         dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
         dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC
         */
         dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
         dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
         QnnGraph_Config_t graph_dlbc_config;
         graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
         graph_dlbc_config.customConfig = &dlbc_config;

-        const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL};
+        QnnHtpGraph_CustomConfig_t opt_config;
+        opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+        opt_config.optimizationOption.floatValue = 1; //1 / 3
+        QnnGraph_Config_t graph_opt_config;
+        graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+        graph_opt_config.customConfig = &opt_config;
+
+        QnnHtpGraph_CustomConfig_t vtcm_config;
+        vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
+        vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb;
+        QnnGraph_Config_t graph_vtcm_config;
+        graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+        graph_vtcm_config.customConfig = &vtcm_config;
+
+        const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config,
+                                                     &graph_dlbc_config,
+                                                     &graph_vtcm_config,
+                                                     &graph_opt_config,
+                                                     NULL};
         error = qnn_raw_interface.graphCreate(
             instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig,
             &graph_handle);
@@ -2428,27 +2481,33 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
         uint8_t * qnn_buffer_2 = nullptr;
         qnn_instance * instance = ctx->instance;

-        qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
+        qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+            ggml_nbytes(src0), 4));
         if (nullptr == qnn_buffer_0) {
             QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
             goto failure;
         } else {
             QNN_LOG_INFO("alloc rpcmem successfully\n");
         }
         instance->register_rpcmem(qnn_buffer_0, tensor_0);
         memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));

-        qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src1), 4));
+        qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+            ggml_nbytes(src1), 4));
         if (nullptr == qnn_buffer_1) {
             QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
             goto failure;
         } else {
             QNN_LOG_INFO("alloc rpcmem successfully\n");
         }
         instance->register_rpcmem(qnn_buffer_1, tensor_1);
         memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));

-        qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(dst), 4));
+        qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+            ggml_nbytes(dst), 4));
         if (nullptr == qnn_buffer_2) {
             QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
             goto failure;
         } else {
             QNN_LOG_INFO("alloc rpcmem successfully\n");
         }
@@ -2457,25 +2516,35 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,

         Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1,
-                                    .v1 = {"ggml_op_mul_mat",
-                                           QNN_OP_PACKAGE_NAME_QTI_AISW,
-                                           QNN_OP_MAT_MUL, 0, qnn_params, 2,
-                                           tensor_inputs, 1, tensor_outputs}};
+        Qnn_OpConfig_t op_config = {
+            (Qnn_OpConfigVersion_t) 1,
+            .v1 = {"ggml_op_mul_mat",
+                   QNN_OP_PACKAGE_NAME_QTI_AISW,
+                   QNN_OP_MAT_MUL,
+                   0, qnn_params,
+                   2, tensor_inputs,
+                   1, tensor_outputs}
+        };
         error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
-        error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
+        error = qnn_raw_interface.graphFinalize(graph_handle,
+                                                nullptr, nullptr);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
         error = qnn_raw_interface.graphExecute(graph_handle,
                                                tensor_inputs, 2,
                                                tensor_outputs, 1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
@@ -2537,9 +2606,14 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
         Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
         error = qnn_raw_interface.graphExecute(graph_handle,
                                                tensor_inputs, 2,
                                                tensor_outputs, 1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
@@ -2580,299 +2654,6 @@ failure:
     perf.info();
 }

-// common function for GGML OPs using QNN API
-static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx,
-                               const enum ggml_op ggmlop,
-                               const ggml_tensor * src0, const ggml_tensor * src1,
-                               ggml_tensor * dst) {
-    Qnn_ErrorHandle_t error = QNN_SUCCESS;
-    bool graph_initialized = false;
-    qnn_instance * instance = nullptr;
-    std::string qnn_graph_name = "ggml_qnn_graph";
-    std::string qnn_op_config_name = "ggml_qnn_op_config";
-    const char * qnn_op_name = nullptr;
-    Qnn_GraphHandle_t graph_handle = nullptr;
-    Qnn_Tensor_t * tensor_0 = nullptr;
-    Qnn_Tensor_t * tensor_1 = nullptr;
-    Qnn_Tensor_t * tensor_2 = nullptr;
-    Qnn_Param_t qnn_params[] = {};
-    Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
-    Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
-    Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32;
-
-    CHECK_PARAMS(ctx, src0, src1, dst);
-    tensor_0 = (Qnn_Tensor_t *) src0->extra;
-    tensor_1 = (Qnn_Tensor_t *) src1->extra;
-    tensor_2 = (Qnn_Tensor_t *) dst->extra;
-    instance = ctx->instance;
-    qnn_perf perf(ggml_op_name(ggmlop));
-    perf.start();
-
-    qnn_op_name = qnn_opname_from_ggmlop(ggmlop);
-    if (nullptr == qnn_op_name) {
-        QNN_LOG_WARN("ggml op %d(%s) not supported currently", ggmlop, ggml_op_name(ggmlop));
-        return;
-    }
-
-    tensor_0 = (Qnn_Tensor_t *) src0->extra;
-    tensor_1 = (Qnn_Tensor_t *) src1->extra;
-    tensor_2 = (Qnn_Tensor_t *) dst->extra;
-    instance = ctx->instance;
-    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
-
-    src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type);
-    src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type);
-    dst_qnn_type  = qnn_datatype_from_ggml_datatype(dst->type);
-
-    QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
-    QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
-    QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
-
-    uint32_t dimensions_input_0[] = {
-        (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2],
-        (uint32_t) src0->ne[3]};
-    uint32_t dimensions_input_1[] = {
-        (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2],
-        (uint32_t) src1->ne[3]};
-    uint32_t dimensions_output[] = {
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
-        (uint32_t) dst->ne[3]};
-
-    std::string map_entry = std::string(ggml_op_name(ggmlop));
-    if (instance->_qnn_graph_map.find(map_entry) !=
-        instance->_qnn_graph_map.end()) {
-        graph_initialized = true;
-        auto & graph_item = instance->_qnn_graph_map[map_entry];
-        graph_handle = std::get<0>(graph_item);
-    }
-
-    uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions;
-    uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions;
-    uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions;
-
-    if (!graph_initialized) {
-        qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) +
-                         std::to_string(ctx->threads) + src0->name + "_" +
-                         src1->name;
-        qnn_op_config_name = qnn_op_config_name + "_" + ggml_op_name(ggmlop) +
-                             std::to_string(ctx->threads) + src0->name + "_" +
-                             src1->name;
-        QNN_LOG_DEBUG("qnn graph name %s", qnn_graph_name.c_str());
-        QNN_LOG_DEBUG("qnn op_config name %s", qnn_op_config_name.c_str());
-        error = qnn_raw_interface.graphCreate(
-            instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr,
-            &graph_handle);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph "
-                         "name %s, error = %d\n",
-                         ggml_op_name(ggmlop), qnn_graph_name.c_str(), error);
-            goto failure;
-        }
-
-        if (ctx->device == QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
-            QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0};
-
-            QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
-            QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0};
-
-            QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
-            QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0};
-        }
-
-        error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-
-        QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
-        QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0);
-        QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type;
-        QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
-        QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1);
-        QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
-        QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
-        QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst);
-        QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
-
-        if (ctx->device != QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
-                                                 qnn_get_ggml_tensor_data_size(src0)};
-            QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
-                                                 qnn_get_ggml_tensor_data_size(src1)};
-            QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
-                                                 qnn_get_ggml_tensor_data_size(dst)};
-        } else {
-            uint8_t * qnn_buffer_0 = nullptr;
-            uint8_t * qnn_buffer_1 = nullptr;
-            uint8_t * qnn_buffer_2 = nullptr;
-            qnn_instance * instance = ctx->instance;
-
-            qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
-            if (nullptr == qnn_buffer_0) {
-                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
-            } else {
-                QNN_LOG_INFO("alloc rpcmem successfully\n");
-            }
-            instance->register_rpcmem(qnn_buffer_0, tensor_0);
-            memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
-
-            qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src1), 4));
-            if (nullptr == qnn_buffer_1) {
-                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
-            } else {
-                QNN_LOG_INFO("alloc rpcmem successfully\n");
-            }
-            instance->register_rpcmem(qnn_buffer_1, tensor_1);
-            memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
-
-            qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(dst), 4));
-            if (nullptr == qnn_buffer_2) {
-                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
-            } else {
-                QNN_LOG_INFO("alloc rpcmem successfully\n");
-            }
-            instance->register_rpcmem(qnn_buffer_2, tensor_2);
-        }
-
-        Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
-        Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1,
-                                    .v1 = {qnn_op_config_name.c_str(),
-                                           QNN_OP_PACKAGE_NAME_QTI_AISW,
-                                           qnn_op_name, 0, qnn_params, 2,
-                                           tensor_inputs, 1, tensor_outputs}};
-        error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.graphExecute(graph_handle,
-                                               tensor_inputs, 2,
-                                               tensor_outputs, 1,
-                                               nullptr, nullptr);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-
-        if (ctx->device == QNN_BACKEND_NPU) {
-            uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_2)->memHandle));
-            memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
-        }
-
-        auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2);
-        instance->_qnn_graph_map[map_entry] = graph_item;
-    } else {
-        auto & graph_item = instance->_qnn_graph_map[map_entry];
-        graph_handle = std::get<0>(graph_item);
-        tensor_0 = std::get<1>(graph_item);
-        tensor_1 = std::get<2>(graph_item);
-        tensor_2 = std::get<3>(graph_item);
-
-        uint32_t dimensions_input_0[] = {
-            (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-            (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]};
-        uint32_t dimensions_input_1[] = {
-            (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-            (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]};
-        uint32_t dimensions_output[] = {
-            (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
-            (uint32_t) dst->ne[3]};
-        QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
-        QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0);
-        QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type;
-        QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
-        QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1);
-        QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
-        QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
-        QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst);
-        QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
-
-        if (ctx->device != QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
-                                                 qnn_get_ggml_tensor_data_size(src0)};
-            QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
-                                                 qnn_get_ggml_tensor_data_size(src1)};
-            QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
-                                                 qnn_get_ggml_tensor_data_size(dst)};
-        } else {
-            uint8_t * qnn_buffer_0 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_0)->memHandle));
-            if (nullptr != qnn_buffer_0)
-                memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
-
-            uint8_t * qnn_buffer_1 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_1)->memHandle));
-            if (nullptr != qnn_buffer_1)
-                memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
-        }
-
-        Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
-        Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        error = qnn_raw_interface.graphExecute(graph_handle,
-                                               tensor_inputs, 2,
-                                               tensor_outputs, 1,
-                                               nullptr, nullptr);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-
-        if (ctx->device == QNN_BACKEND_NPU) {
-            uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_2)->memHandle));
-            if (nullptr != qnn_buffer_2)
-                memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
-        }
-    }
-
-failure:
-    if (QNN_SUCCESS != error) {
-        QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0));
-        QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1));
-        QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2));
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                      " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      src0->name, src0->type, ggml_type_name(src0->type),
-                      src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0],
-                      src0->nb[1], src0->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                      " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      src1->name, src1->type, ggml_type_name(src1->type),
-                      src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0],
-                      src1->nb[1], src1->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                      " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0],
-                      dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]);
-        QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2],
-                      src0->ne[3]);
-    }
-
-    QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions;
-    QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions;
-    QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions;
-    perf.info();
-}
-
 static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx,
                             const ggml_tensor * src0, const ggml_tensor * src1,
                             ggml_tensor * dst) {
@@ -3038,21 +2819,14 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx,
                               struct ggml_compute_params * params,
                               struct ggml_tensor * tensor) {
     ggml_qnn_func_t func = nullptr;
-    ggml_qnn_func_common_t func_common = nullptr;

     switch (tensor->op) {
         case GGML_OP_ADD:
             func = ggml_qnn_add;
             break;
-
-        case GGML_OP_MUL:
-            func_common = ggml_qnn_hanlde_op;
-            break;
-
         case GGML_OP_MUL_MAT:
             func = ggml_qnn_mul_mat;
             break;
-
         case GGML_OP_REPEAT:
             func = ggml_qnn_repeat;
             break;
@@ -3062,15 +2836,12 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx,
         case GGML_OP_DUP:
             func = ggml_qnn_dup;
             break;
-
         case GGML_OP_ACC:
             func = ggml_qnn_acc;
             break;
-
         case GGML_OP_DIV:
             func = ggml_qnn_div;
             break;
-
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(tensor)) {
                 case GGML_UNARY_OP_GELU:
@@ -3169,10 +2940,9 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx,
         return false;
     }

-    if (nullptr != func) func(ctx, tensor->src[0], tensor->src[1], tensor);
-
-    if (nullptr != func_common)
-        func_common(ctx, tensor->op, tensor->src[0], tensor->src[1], tensor);
+    if (nullptr != func) {
+        func(ctx, tensor->src[0], tensor->src[1], tensor);
+    }

     return true;
 }
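With func_common gone, dispatch in ggml_qnn_compute_forward is a single function-pointer scheme: each op maps to a ggml_qnn_func_t, or the function bails out. A compact standalone model of what remains:

#include <cstdio>

enum op_t { OP_ADD, OP_MUL_MAT, OP_OTHER };
struct ctx_t {}; struct tensor_t { op_t op; tensor_t * src[2]; };

using qnn_func_t = void (*)(ctx_t *, const tensor_t *, const tensor_t *, tensor_t *);

static void do_add(ctx_t *, const tensor_t *, const tensor_t *, tensor_t *)     { printf("add\n"); }
static void do_mul_mat(ctx_t *, const tensor_t *, const tensor_t *, tensor_t *) { printf("mul_mat\n"); }

static bool compute_forward(ctx_t * ctx, tensor_t * t) {
    qnn_func_t func = nullptr;
    switch (t->op) {
        case OP_ADD:     func = do_add;     break;
        case OP_MUL_MAT: func = do_mul_mat; break;
        default:         return false;
    }
    if (nullptr != func) {
        func(ctx, t->src[0], t->src[1], t);
    }
    return true;
}

int main() {
    ctx_t ctx;
    tensor_t a{OP_ADD, {nullptr, nullptr}};
    tensor_t t{OP_MUL_MAT, {&a, &a}};
    return compute_forward(&ctx, &t) ? 0 : 1;
}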
@@ -3221,41 +2991,28 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t
     }
     Qnn_Tensor_t qnn_tensor = QNN_TENSOR_INIT;

-    if (ctx->device != QNN_BACKEND_GPU) {
-        qnn_tensor = {
-            .version = QNN_TENSOR_VERSION_1,
-            {.v1 = {.id = 0,
-                    .name = tensor_name,
-                    .type = qnn_tensor_type,
-                    .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
-                    .dataType = qnn_data_type,
-                    .quantizeParams =
-                        {QNN_DEFINITION_UNDEFINED,
-                         QNN_QUANTIZATION_ENCODING_UNDEFINED,
-                         {.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
-                                                  .offset = 0}}},
-                    .rank = qnn_get_ggml_tensor_rank(tensor),
-                    .dimensions = dimensions,
-                    .memType = QNN_TENSORMEMTYPE_RAW,
-                    {.clientBuf = {.data = nullptr, .dataSize = 0}}}}};
-    } else {
-        qnn_tensor = {
-            .version = QNN_TENSOR_VERSION_1,
-            {.v1 = {.id = 0,
-                    .name = tensor_name,
-                    .type = qnn_tensor_type,
-                    .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
-                    .dataType = qnn_data_type,
-                    .quantizeParams =
-                        {QNN_DEFINITION_UNDEFINED,
-                         QNN_QUANTIZATION_ENCODING_UNDEFINED,
-                         {.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
-                                                  .offset = 0}}},
-                    .rank = qnn_get_ggml_tensor_rank(tensor),
-                    .dimensions = dimensions,
-                    .memType = QNN_TENSORMEMTYPE_MEMHANDLE,
-                    {.clientBuf = {.data = nullptr, .dataSize = 0}}}}};
-    }
+    Qnn_TensorMemType_t qnn_mem_type = QNN_TENSORMEMTYPE_RAW;
+    if (ctx->device == QNN_BACKEND_GPU) {
+        qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE;
+    }
+
+    qnn_tensor = {
+        .version = QNN_TENSOR_VERSION_1,
+        {.v1 = {.id = 0,
+                .name = tensor_name,
+                .type = qnn_tensor_type,
+                .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
+                .dataType = qnn_data_type,
+                .quantizeParams =
+                    {QNN_DEFINITION_UNDEFINED,
+                     QNN_QUANTIZATION_ENCODING_UNDEFINED,
+                     {.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
+                                              .offset = 0}}},
+                .rank = qnn_get_ggml_tensor_rank(tensor),
+                .dimensions = dimensions,
+                .memType = qnn_mem_type,
+                {.clientBuf = {.data = nullptr, .dataSize = 0}}}}};

     Qnn_Tensor_t * p_qnn_tensor =
         (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t));
     if (nullptr == p_qnn_tensor) {
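The rewrite above is a classic de-duplication: the two qnn_tensor initializers differed only in memType, so the varying value is hoisted into qnn_mem_type and a single initializer remains. The same shape in miniature:

#include <cstdio>

enum mem_type_t { MEM_RAW, MEM_HANDLE };
enum device_t   { DEV_CPU, DEV_GPU, DEV_NPU };
struct tensor_t { const char * name; mem_type_t memType; };

static tensor_t make_tensor(device_t device, const char * name) {
    // hoist the only field that differs, then initialize once
    mem_type_t mem_type = MEM_RAW;
    if (device == DEV_GPU) {
        mem_type = MEM_HANDLE;
    }
    tensor_t t = {name, mem_type};
    return t;
}

int main() {
    tensor_t a = make_tensor(DEV_CPU, "t0");
    tensor_t b = make_tensor(DEV_GPU, "t1");
    printf("%s:%d %s:%d\n", a.name, a.memType, b.name, b.memType);
    return 0;
}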
@@ -12,8 +12,8 @@ ANDROID_PLATFORM=android-34

 GGML_QNN_UT=ggml-qnn-ut
 REMOTE_PATH=/data/local/tmp/
-BUILDTYPE=Debug
 BUILDTYPE=Release
+BUILDTYPE=Debug


 function dump_vars()
@@ -100,7 +100,7 @@ function update_qnn_libs()
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/

-    #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully
+    #the QNN NPU(aka HTP) backend only verified on Qualcomm Snapdragon 8 Gen 3 equipped Android phone
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/
@@ -142,14 +142,9 @@ function run_ggml_qnn_ut()

     case "$ggmlop" in
         GGML_OP_ADD)
-            echo "adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend"
             adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend
         ;;

-        GGML_OP_MUL)
-            adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL -b $qnnbackend
-        ;;
-
         GGML_OP_MUL_MAT)
             adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL_MAT -b $qnnbackend
         ;;
@@ -169,7 +164,6 @@ function show_usage()
     echo "  $0 build (build Android command line UT program)"
     echo "  $0 updateqnnlibs (upload the latest QNN libs to Android phone)"
     echo "  $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
-    echo "  $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
     echo "  $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
     echo -e "\n\n\n"
 }
@@ -346,7 +346,7 @@ static void show_usage() {
         "\nUsage: test_qnn_ops [options]\n" \
         "\n" \
         "Options:\n" \
-        " -t GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MULMAT\n" \
+        " -t GGML_OP_ADD / GGML_OP_MULMAT\n" \
         " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" \
         " ?/h print usage infomation\n\n"
     );
@@ -418,13 +418,9 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) {
     QNN_LOG_DEBUG("sizex: %d\n", sizex);
     QNN_LOG_DEBUG("sizey: %d\n", sizey);

-    if (n_ggml_op_type == GGML_OP_MUL) {
-        src0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-        src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-    } else {
-        src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-        src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-    }
+    src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);

     ggml_set_input(src0);
     ggml_set_input(src1);
@@ -432,9 +428,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) {
         case GGML_OP_ADD:
             dst = ggml_add(ctx, src0, src1);
             break;
-        case GGML_OP_MUL:
-            dst = ggml_mul(ctx, src0, src1);
-            break;
         case GGML_OP_MUL_MAT:
             dst = ggml_mul_mat(ctx, src0, src1);
             break;
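For context, the unit test builds a tiny one-op graph around the tensors created above and runs it. A minimal sketch of that flow against the public ggml API (function names as in ggml.h of this era; treat exact signatures as assumptions):

#include "ggml.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // mirrors qnn_op_ut(): qtype-or-F32 src0 combined with F32 src1
    struct ggml_tensor * src0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 32, 32);
    struct ggml_tensor * src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 32, 32);
    ggml_set_input(src0);
    ggml_set_input(src1);

    struct ggml_tensor * dst = ggml_add(ctx, src0, src1);  // or ggml_mul_mat(...)
    ggml_set_output(dst);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, dst);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 4);

    ggml_free(ctx);
    return 0;
}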
@@ -518,8 +511,6 @@ int main(int argc, char * argv[]) {
                 n_ggml_op_type = GGML_OP_ADD;
             } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) {
                 n_ggml_op_type = GGML_OP_MUL_MAT;
-            } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) {
-                n_ggml_op_type = GGML_OP_MUL;
             } else {
                 show_usage();
                 return 1;