review: make an MVP (Minimum Viable PR) style PR in upstream
This commit is contained in:
parent faaa86b7e4
commit 5598fbd15d
3 changed files with 185 additions and 443 deletions
593 ggml-qnn.cpp
@@ -55,7 +55,7 @@
 #include "Saver/QnnSaver.h"
 #include "System/QnnSystemInterface.h"
 #include "HTP/QnnHtpDevice.h"
-#include <HTP/QnnHtpGraph.h>
+#include "HTP/QnnHtpGraph.h"

 // =================================================================================================
 //
@@ -91,12 +91,6 @@ typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx,
                                 const ggml_tensor * src1,
                                 ggml_tensor * dst);

-typedef void (*ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx,
-                                       const ggml_op ggml_op,
-                                       const ggml_tensor * src0,
-                                       const ggml_tensor * src1,
-                                       ggml_tensor * dst);
-
 enum qcom_htp_arch {
     NONE = 0,
     V68 = 68,
@@ -424,6 +418,7 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor
     return true;
 }

+#ifndef NDEBUG
 #define CHECK_PARAMS(ctx, src0, src1, dst)                          \
     do {                                                            \
         if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) {  \
@@ -431,6 +426,10 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor
         }                                                           \
     } while (0)

+#else
+#define CHECK_PARAMS(ctx, src0, src1, dst)
+#endif
+
 #if ENABLE_QNNBACKEND_PERF
 class qnn_perf {
 public:
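Review note: guarding CHECK_PARAMS with #ifndef NDEBUG makes the per-op argument validation a debug-only cost; release builds expand the macro to nothing. A minimal self-contained sketch of the same pattern (the names here are illustrative, not taken from the patch):

```cpp
#include <cstdio>

// Debug-only validation: a real check in debug builds, a no-op when
// NDEBUG is defined (i.e. in release builds).
#ifndef NDEBUG
#define VALIDATE_OR_RETURN(cond, msg)            \
    do {                                         \
        if (!(cond)) {                           \
            std::fprintf(stderr, "%s\n", (msg)); \
            return;                              \
        }                                        \
    } while (0)
#else
#define VALIDATE_OR_RETURN(cond, msg)
#endif

static void example_op(const float * src, float * dst) {
    VALIDATE_OR_RETURN(src != nullptr && dst != nullptr, "null tensor");
    dst[0] = src[0]; // the real op body would go here
}
```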
@@ -446,7 +445,7 @@ public:
     void info() {
         _end_time = ggml_time_us();
         _duration = (_end_time - _begin_time);
-        QNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration);
+        QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration);
     }

 private:
@@ -809,7 +808,7 @@ static void qnn_sdk_logcallback(const char * fmt, QnnLog_Level_t level,

         memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN);
         vsnprintf(reinterpret_cast<char *const>(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp);
-        QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf);
+        QNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf);
     }
 #endif
 }
@@ -1069,7 +1068,7 @@ class qnn_instance {
             arch_devconfig.option       = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
             arch_devconfig.customConfig = &arch_customconfig;

-            const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, NULL};
+            const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr};
             qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle);
         } else {
             qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle);
@@ -1137,10 +1136,14 @@ class qnn_instance {
             _pfn_rpc_mem_init();
         }

-        std::vector<const QnnContext_Config_t *> temp_context_config;
+        /* TODO: not used, keep it for further usage
+        QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT;
+        qnn_context_config.priority = QNN_PRIORITY_DEFAULT;
+        const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr};
+        */
         _qnn_interface.qnn_context_create(
             _qnn_backend_handle, _qnn_device_handle,
-            temp_context_config.empty() ? nullptr : temp_context_config.data(),
+            nullptr,
             &_qnn_context_handle);
         if (nullptr == _qnn_context_handle) {
             QNN_LOG_WARN("why failed to initialize qnn context\n");
@@ -1157,9 +1160,11 @@ class qnn_instance {
         size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048};
         size_t probe_counts  = sizeof(probe_slots) / sizeof(size_t);
         for (size_t idx = 0; idx < probe_counts; idx++) {
-            rpc_buffer = static_cast<uint8_t *>(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4));
+            rpc_buffer = static_cast<uint8_t *>(alloc_rpcmem(
+                    probe_slots[idx] * size_in_mb, 4));
             if (nullptr == rpc_buffer) {
-                QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno));
+                QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n",
+                             probe_slots[idx], strerror(errno));
                 break;
             } else {
                 candidate_size = probe_slots[idx];
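Review note on the probe loop above: it sizes the usable rpcmem pool by attempting progressively larger allocations and remembering the largest slot that succeeds. A hedged sketch of the same strategy, with plain malloc/free standing in for the rpcmem allocator:

```cpp
#include <cstdlib>
#include <cstddef>

// Probe candidate capacities (in MB) from small to large; keep the largest
// size that can actually be allocated, releasing each probe buffer again.
static size_t probe_capacity_mb() {
    const size_t size_in_mb    = 1024 * 1024;
    const size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048};
    size_t candidate_mb = 0;
    for (size_t slot : probe_slots) {
        void * buf = std::malloc(slot * size_in_mb);
        if (buf == nullptr) {
            break;            // first failure ends the probe
        }
        candidate_mb = slot;  // largest successful size so far
        std::free(buf);
    }
    return candidate_mb;
}
```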
@@ -1262,8 +1267,8 @@ class qnn_instance {
         return ret_status;
     }

-    //keep it for further usage of offload the entire cgraph to a single QNN DAG directly
-    //which was used in Qualcomm's dedicated AI technology
+    //TODO:keep it for further usage of offload the entire cgraph to a single QNN DAG directly
+    // which was used in Qualcomm's dedicated AI technology
 #if 0
     int init_qnn_graph(const char * graph_name, bool debug,
                        uint8_t do_node_validation = true,
@@ -1430,13 +1435,14 @@ class qnn_instance {
         QnnHtpPerfInfrastructure_PowerConfig_t power_config;
         memset(&power_config, 0, sizeof(power_config));
         power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3;
-        power_config.dcvsV3Config.dcvsEnable    = 0;
         power_config.dcvsV3Config.setDcvsEnable = 1;
+        power_config.dcvsV3Config.dcvsEnable    = 0;
         power_config.dcvsV3Config.contextId     = _qnn_power_configid;
         power_config.dcvsV3Config.powerMode     = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE;
         power_config.dcvsV3Config.setSleepLatency =
             1; // true to consider Latency parameter otherwise false
-        power_config.dcvsV3Config.sleepLatency = 10;
+        power_config.dcvsV3Config.sleepLatency = 40;
         power_config.dcvsV3Config.setBusParams =
             1; // true to consider Bus parameter otherwise false
         power_config.dcvsV3Config.setCoreParams =
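Review note: power_configs below follows the convention used throughout the QNN API, an array of pointers to config structs terminated by a null entry rather than an explicit count. A small self-contained illustration of the idiom with a hypothetical config type:

```cpp
#include <cstddef>

struct Config {
    int          option;
    const void * payload;
};

// QNN-style consumer: walks a null-terminated array of config pointers.
static int count_configs(const Config * const * configs) {
    int n = 0;
    if (configs != nullptr) {
        while (configs[n] != nullptr) {
            ++n;
        }
    }
    return n;
}

int main() {
    Config a{1, nullptr};
    Config b{2, nullptr};
    const Config * list[] = {&a, &b, nullptr}; // the terminator is mandatory
    return count_configs(list) == 2 ? 0 : 1;
}
```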
@@ -1459,6 +1465,7 @@ class qnn_instance {
             DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
         power_config.dcvsV3Config.coreVoltageCornerMax =
             DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
+
         // set power config with different performance parameters
         const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {
             &power_config, nullptr};
@@ -1550,6 +1557,7 @@ class qnn_instance {
             QNN_LOG_WARN("rpc memory already allocated\n");
             return 3;
         }
+
         if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) {
             QNN_LOG_WARN("tensor %s has been registered shared memory\n",
                          (QNN_VER_PTR(*p_tensor)->name));
@@ -1710,7 +1718,7 @@ class qnn_instance {
         int result = 0;

         if (nullptr == _system_lib_handle) {
-            QNN_LOG_DEBUG("system lib handle is null\n");
+            QNN_LOG_WARN("system lib handle is null\n");
             return 1;
         }

@@ -1724,8 +1732,7 @@ class qnn_instance {

         int dlclose_error = dlclose(_system_lib_handle);
         if (dlclose_error != 0) {
-            QNN_LOG_WARN("failed to close QnnSystem library, error %s\n",
-                         dlerror());
+            QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror());
             return 2;
         }

@@ -1740,8 +1747,7 @@ class qnn_instance {

         void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL);
         if (nullptr == lib_handle) {
-            QNN_LOG_WARN("can not open QNN library %s, with error: %s",
-                         lib_path.c_str(), dlerror());
+            QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror());
             return 1;
         }

@@ -1749,8 +1755,7 @@ class qnn_instance {
         load_qnn_functionpointers<pfn_qnninterface_getproviders *>(
             lib_handle, "QnnInterface_getProviders");
         if (nullptr == get_providers) {
-            QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s",
-                         dlerror());
+            QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror());
             return 2;
         }

@@ -1758,14 +1763,12 @@ class qnn_instance {
         const QnnInterface_t ** provider_list = nullptr;
         error = get_providers(&provider_list, &num_providers);
         if (error != QNN_SUCCESS) {
-            QNN_LOG_WARN("failed to get providers, error %d",
-                         QNN_GET_ERROR_CODE(error));
+            QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error));
             return 3;
         }
         QNN_LOG_DEBUG("num_providers=%d\n", num_providers);
         if (num_providers != _required_num_providers) {
-            QNN_LOG_WARN("providers is %d instead of required %d", num_providers,
-                         _required_num_providers);
+            QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers);
             return 4;
         }

@@ -1797,16 +1800,14 @@ class qnn_instance {
         BackendIdType backend_id          = provider_list[0]->backendId;
         _lib_path_to_backend_id[lib_path] = backend_id;
         if (_loaded_backend.count(backend_id) > 0) {
-            QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n",
-                         lib_path.c_str(), backend_id);
+            QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id);
         }
         _loaded_backend[backend_id] = provider_list[0];
         if (_loaded_lib_handle.count(backend_id) > 0) {
             QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]);
             int dlclose_error = dlclose(_loaded_lib_handle[backend_id]);
             if (dlclose_error != 0) {
-                QNN_LOG_WARN("fail to close %p with error %s\n",
-                             _loaded_lib_handle[backend_id], dlerror());
+                QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror());
             }
         }
         _loaded_lib_handle[backend_id] = lib_handle;
@@ -1820,8 +1821,7 @@ class qnn_instance {
         for (auto & it : _loaded_lib_handle) {
             dlclose_error = dlclose(it.second);
             if (dlclose_error != 0) {
-                QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first,
-                             dlerror());
+                QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror());
             }
         }

@@ -1924,7 +1924,6 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
     const int64_t ne01 = src0->ne[1];
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
-
     // make qnn_get_ggml_tensor_rank and QNN SDK happy
     if (ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1) {
         return false;
@@ -1932,13 +1931,13 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,

     // TODO: support other GGML OPs using QNN API
     // a GENERAL approach could fix this problem in a standalone PR of refine ggml backend
-    // subsystem for mixed inference between CPU&GPU / CPU&NPU easily for ANY ggml backends
-    // which the backend's ggml_backend_xxx_buffer_is_host return true.
-    // this approach could be found:
+    // subsystem for hybrid inference between CPU&GPU / CPU&NPU easily(less the 100 LoC and no
+    // side-effect to the existing codes) for ANY ggml backends which the backend's
+    // ggml_backend_xxx_buffer_is_host return true. this approach could be found at:
     // https://github.com/ggerganov/llama.cpp/pull/7641
     bool supported_op = false;
     supported_op = (tensor->op == GGML_OP_ADD);
-    supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT));
+    supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT));
     if (!supported_op) {
         return false;
     }
@@ -1950,14 +1949,9 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
         }
     }

-    int qtype = src0->type;
-    if (tensor->op == GGML_OP_MUL) {
-        return (qtype == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32);
-    }
-
     if (tensor->op == GGML_OP_MUL_MAT) {
         if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) {
-            //make mul_mat with QNN RPC happy
+            //comment it for make UT of mul_mat with QNN RPC happy
             //return false;
         }
     }
@@ -1965,6 +1959,8 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
     return true;
 }

+//TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat
+// keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC
 static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
                          const ggml_tensor * src1, ggml_tensor * dst) {
     Qnn_ErrorHandle_t error = QNN_SUCCESS;
@@ -1986,10 +1982,11 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
     tensor_1 = (Qnn_Tensor_t *) src1->extra;
     tensor_2 = (Qnn_Tensor_t *) dst->extra;
     instance = ctx->instance;
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;

     qnn_perf perf("ggml_qnn_add");
     perf.start();

-    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
     QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
@@ -2034,17 +2031,31 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src

             QnnHtpGraph_CustomConfig_t dlbc_config;
             dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
-            /*
             dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
-            dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC
-            */
-            dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+            dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC

             QnnGraph_Config_t graph_dlbc_config;
             graph_dlbc_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
             graph_dlbc_config.customConfig = &dlbc_config;

-            const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL};
+            QnnHtpGraph_CustomConfig_t opt_config;
+            opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+            opt_config.optimizationOption.floatValue = 1; // 1 / 3
+            QnnGraph_Config_t graph_opt_config;
+            graph_opt_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_opt_config.customConfig = &opt_config;
+
+            QnnHtpGraph_CustomConfig_t vtcm_config;
+            vtcm_config.option       = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
+            vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb;
+            QnnGraph_Config_t graph_vtcm_config;
+            graph_vtcm_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_vtcm_config.customConfig = &vtcm_config;
+
+            const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config,
+                                                         &graph_dlbc_config,
+                                                         &graph_vtcm_config,
+                                                         &graph_opt_config,
+                                                         NULL};
             error = qnn_raw_interface.graphCreate(
                 instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig,
                 &graph_handle);
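Review note: the NPU graph setup now chains four custom configs (HVX threads, DLBC, VTCM size, finalize-optimization) into one null-terminated array for graphCreate. If the list keeps growing it could be factored into a helper along these lines; a hypothetical refactor sketch, not part of the patch (QnnGraph_Config_t comes from the QNN SDK headers this file already includes):

```cpp
#include <vector>

// Builds the null-terminated pointer list graphCreate() expects from a set
// of already-filled QnnGraph_Config_t wrappers. The storage vector must
// outlive the graphCreate() call, since the list points into it.
static std::vector<const QnnGraph_Config_t *> make_graph_config_list(
        const std::vector<QnnGraph_Config_t> & storage) {
    std::vector<const QnnGraph_Config_t *> list;
    list.reserve(storage.size() + 1);
    for (const QnnGraph_Config_t & cfg : storage) {
        list.push_back(&cfg);
    }
    list.push_back(nullptr); // QNN expects a null terminator
    return list;
}
```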
@@ -2113,27 +2124,33 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
             uint8_t * qnn_buffer_2  = nullptr;
             qnn_instance * instance = ctx->instance;

-            qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
+            qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                    ggml_nbytes(src0), 4));
             if (nullptr == qnn_buffer_0) {
                 QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+                goto failure;
             } else {
                 QNN_LOG_INFO("alloc rpcmem successfully\n");
             }
             instance->register_rpcmem(qnn_buffer_0, tensor_0);
             memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));

-            qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src1), 4));
+            qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                    ggml_nbytes(src1), 4));
             if (nullptr == qnn_buffer_1) {
                 QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+                goto failure;
             } else {
                 QNN_LOG_INFO("alloc rpcmem successfully\n");
             }
             instance->register_rpcmem(qnn_buffer_1, tensor_1);
             memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));

-            qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(dst), 4));
+            qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                    ggml_nbytes(dst), 4));
             if (nullptr == qnn_buffer_2) {
                 QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+                goto failure;
             } else {
                 QNN_LOG_INFO("alloc rpcmem successfully\n");
             }
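Review note: each NPU tensor above goes through the same staging sequence: allocate shared rpc memory, register it so the tensor's memHandle refers to it, then memcpy the ggml data in (the result is copied back out after graphExecute). A condensed sketch of that order of operations, reusing the qnn_instance helpers this file already defines:

```cpp
#include <cstring>

// Stage one ggml input tensor into QNN shared memory for the NPU path.
// alloc_rpcmem/register_rpcmem are the existing qnn_instance helpers.
static bool stage_npu_input(qnn_instance * instance, Qnn_Tensor_t * tensor,
                            const void * data, size_t nbytes) {
    uint8_t * buf = static_cast<uint8_t *>(instance->alloc_rpcmem(nbytes, 4));
    if (nullptr == buf) {
        return false; // caller bails out (the patch now does goto failure)
    }
    instance->register_rpcmem(buf, tensor); // binds buf to tensor's memHandle
    memcpy(buf, data, nbytes);              // copy ggml data into shared mem
    return true;
}
```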
@@ -2144,23 +2161,33 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
         Qnn_OpConfig_t op_config = {
             (Qnn_OpConfigVersion_t) 1,
-            .v1 = {"ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW,
-                   QNN_OP_ELEMENT_WISE_ADD, 0, qnn_params,
-                   2, tensor_inputs, 1,
-                   tensor_outputs}};
+            .v1 = {"ggml_op_add",
+                   QNN_OP_PACKAGE_NAME_QTI_AISW,
+                   QNN_OP_ELEMENT_WISE_ADD,
+                   0, qnn_params,
+                   2, tensor_inputs,
+                   1,tensor_outputs}
+        };
         error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
-        error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
+        error = qnn_raw_interface.graphFinalize(graph_handle,
+                                                nullptr, nullptr);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
-        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2,
+        error = qnn_raw_interface.graphExecute(graph_handle,
+                                               tensor_inputs, 2,
                                                tensor_outputs, 1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
@@ -2221,9 +2248,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src

         Qnn_Tensor_t tensor_inputs[]  = {*tensor_0, *tensor_1};
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2,
+        error = qnn_raw_interface.graphExecute(graph_handle,
+                                               tensor_inputs,2,
                                                tensor_outputs,1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
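Review note: the added check distinguishes an NPU sub-system restart (SSR) from an ordinary graph failure by testing for QNN_COMMON_ERROR_SYSTEM_COMMUNICATION after graphExecute. Since the same block now appears after every execute call, it could be folded into one helper; a hypothetical sketch:

```cpp
// Hypothetical helper wrapping the SSR check the patch repeats after each
// graphExecute call; returns true when the NPU appears to have crashed.
static bool detect_npu_ssr(int device, Qnn_ErrorHandle_t error) {
    if (device == QNN_BACKEND_NPU &&
        QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
        QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
        return true;
    }
    return false;
}
```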
@@ -2299,6 +2332,8 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
     tensor_1 = (Qnn_Tensor_t *) src1->extra;
     tensor_2 = (Qnn_Tensor_t *) dst->extra;
     instance = ctx->instance;
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
+
     qnn_perf perf("ggml_qnn_mul_mat");
     perf.start();

@@ -2307,7 +2342,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
     tensor_2 = (Qnn_Tensor_t *) dst->extra;
     instance = ctx->instance;

-    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
     QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
@@ -2338,6 +2372,11 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
     uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions;
     uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions;

+    //TODO: for scenarios of quantized data in src0
+    //      pass-1: dequantize src0 to FP32
+    //      pass-2: dq-src0 * src1
+    //      the performance gains is worth although there is performance loss in pass-1
+
     if (!graph_initialized) {
         graph_name = graph_name + "_" + std::to_string(ctx->threads) +
                      "_" + src0->name + "_" + src1->name;
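Review note: the new TODO sketches a two-pass plan for quantized src0: dequantize to FP32 first, then hand a plain FP32 MAT_MUL to QNN. A hedged sketch of what pass-1 could look like using ggml's type traits (assuming the ggml_internal_get_type_traits API available in this tree; treat it as illustrative only):

```cpp
#include "ggml.h"

// Pass-1 sketch: dequantize a quantized 2D src0 into an FP32 buffer so that
// pass-2 can run an FP32 MAT_MUL on the NPU. wdata must hold ne[0]*ne[1]
// floats; row strides come from src0->nb as elsewhere in ggml.
static void dequantize_src0_to_f32(const ggml_tensor * src0, float * wdata) {
    const ggml_type_traits_t traits = ggml_internal_get_type_traits(src0->type);
    const int64_t ncols = src0->ne[0];
    const int64_t nrows = src0->ne[1];
    for (int64_t r = 0; r < nrows; ++r) {
        const char * row = (const char *) src0->data + r * src0->nb[1];
        traits.to_float(row, wdata + r * ncols, ncols);
    }
}
```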
@@ -2352,17 +2391,31 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,

             QnnHtpGraph_CustomConfig_t dlbc_config;
             dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
-            /*
             dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
-            dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC
-            */
-            dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+            dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC

             QnnGraph_Config_t graph_dlbc_config;
             graph_dlbc_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
             graph_dlbc_config.customConfig = &dlbc_config;

-            const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL};
+            QnnHtpGraph_CustomConfig_t opt_config;
+            opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+            opt_config.optimizationOption.floatValue = 1; //1 / 3
+            QnnGraph_Config_t graph_opt_config;
+            graph_opt_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_opt_config.customConfig = &opt_config;
+
+            QnnHtpGraph_CustomConfig_t vtcm_config;
+            vtcm_config.option       = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
+            vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb;
+            QnnGraph_Config_t graph_vtcm_config;
+            graph_vtcm_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_vtcm_config.customConfig = &vtcm_config;
+
+            const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config,
+                                                         &graph_dlbc_config,
+                                                         &graph_vtcm_config,
+                                                         &graph_opt_config,
+                                                         NULL};
             error = qnn_raw_interface.graphCreate(
                 instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig,
                 &graph_handle);
@@ -2428,27 +2481,33 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
             uint8_t * qnn_buffer_2  = nullptr;
             qnn_instance * instance = ctx->instance;

-            qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
+            qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                    ggml_nbytes(src0), 4));
             if (nullptr == qnn_buffer_0) {
                 QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+                goto failure;
             } else {
                 QNN_LOG_INFO("alloc rpcmem successfully\n");
             }
             instance->register_rpcmem(qnn_buffer_0, tensor_0);
             memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));

-            qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src1), 4));
+            qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                    ggml_nbytes(src1), 4));
             if (nullptr == qnn_buffer_1) {
                 QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+                goto failure;
             } else {
                 QNN_LOG_INFO("alloc rpcmem successfully\n");
             }
             instance->register_rpcmem(qnn_buffer_1, tensor_1);
             memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));

-            qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(dst), 4));
+            qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                    ggml_nbytes(dst), 4));
             if (nullptr == qnn_buffer_2) {
                 QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+                goto failure;
             } else {
                 QNN_LOG_INFO("alloc rpcmem successfully\n");
             }
@@ -2457,17 +2516,22 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,

         Qnn_Tensor_t tensor_inputs[]  = {*tensor_0, *tensor_1};
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1,
+        Qnn_OpConfig_t op_config = {
+            (Qnn_OpConfigVersion_t) 1,
             .v1 = {"ggml_op_mul_mat",
                    QNN_OP_PACKAGE_NAME_QTI_AISW,
-                   QNN_OP_MAT_MUL, 0, qnn_params, 2,
-                   tensor_inputs, 1, tensor_outputs}};
+                   QNN_OP_MAT_MUL,
+                   0, qnn_params,
+                   2, tensor_inputs,
+                   1, tensor_outputs}
+        };
         error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
-        error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
+        error = qnn_raw_interface.graphFinalize(graph_handle,
+                                                nullptr, nullptr);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
@@ -2476,6 +2540,11 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
                                                tensor_inputs, 2,
                                                tensor_outputs, 1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
@@ -2540,6 +2609,11 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
                                                tensor_inputs, 2,
                                                tensor_outputs, 1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
@@ -2580,299 +2654,6 @@ failure:
     perf.info();
 }

-// common function for GGML OPs using QNN API
-static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx,
-                               const enum ggml_op ggmlop,
-                               const ggml_tensor * src0, const ggml_tensor * src1,
-                               ggml_tensor * dst) {
-    Qnn_ErrorHandle_t error        = QNN_SUCCESS;
-    bool graph_initialized         = false;
-    qnn_instance * instance        = nullptr;
-    std::string qnn_graph_name     = "ggml_qnn_graph";
-    std::string qnn_op_config_name = "ggml_qnn_op_config";
-    const char * qnn_op_name       = nullptr;
-    Qnn_GraphHandle_t graph_handle = nullptr;
-    Qnn_Tensor_t * tensor_0        = nullptr;
-    Qnn_Tensor_t * tensor_1        = nullptr;
-    Qnn_Tensor_t * tensor_2        = nullptr;
-    Qnn_Param_t qnn_params[]       = {};
-    Qnn_DataType_t src0_qnn_type   = QNN_DATATYPE_FLOAT_32;
-    Qnn_DataType_t src1_qnn_type   = QNN_DATATYPE_FLOAT_32;
-    Qnn_DataType_t dst_qnn_type    = QNN_DATATYPE_FLOAT_32;
-
-    CHECK_PARAMS(ctx, src0, src1, dst);
-    tensor_0 = (Qnn_Tensor_t *) src0->extra;
-    tensor_1 = (Qnn_Tensor_t *) src1->extra;
-    tensor_2 = (Qnn_Tensor_t *) dst->extra;
-    instance = ctx->instance;
-    qnn_perf perf(ggml_op_name(ggmlop));
-    perf.start();
-
-    qnn_op_name = qnn_opname_from_ggmlop(ggmlop);
-    if (nullptr == qnn_op_name) {
-        QNN_LOG_WARN("ggml op %d(%s) not supported currently", ggmlop, ggml_op_name(ggmlop));
-        return;
-    }
-
-    tensor_0 = (Qnn_Tensor_t *) src0->extra;
-    tensor_1 = (Qnn_Tensor_t *) src1->extra;
-    tensor_2 = (Qnn_Tensor_t *) dst->extra;
-    instance = ctx->instance;
-    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
-
-    src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type);
-    src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type);
-    dst_qnn_type  = qnn_datatype_from_ggml_datatype(dst->type);
-
-    QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
-    QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
-    QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
-
-    uint32_t dimensions_input_0[] = {
-        (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2],
-        (uint32_t) src0->ne[3]};
-    uint32_t dimensions_input_1[] = {
-        (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2],
-        (uint32_t) src1->ne[3]};
-    uint32_t dimensions_output[] = {
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
-        (uint32_t) dst->ne[3]};
-
-    std::string map_entry = std::string(ggml_op_name(ggmlop));
-    if (instance->_qnn_graph_map.find(map_entry) !=
-        instance->_qnn_graph_map.end()) {
-        graph_initialized = true;
-        auto & graph_item = instance->_qnn_graph_map[map_entry];
-        graph_handle      = std::get<0>(graph_item);
-    }
-
-    uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions;
-    uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions;
-    uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions;
-
-    if (!graph_initialized) {
-        qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) +
-                         std::to_string(ctx->threads) + src0->name + "_" +
-                         src1->name;
-        qnn_op_config_name = qnn_op_config_name + "_" + ggml_op_name(ggmlop) +
-                             std::to_string(ctx->threads) + src0->name + "_" +
-                             src1->name;
-        QNN_LOG_DEBUG("qnn graph name %s", qnn_graph_name.c_str());
-        QNN_LOG_DEBUG("qnn op_config name %s", qnn_op_config_name.c_str());
-        error = qnn_raw_interface.graphCreate(
-            instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr,
-            &graph_handle);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph "
-                         "name %s, error = %d\n",
-                         ggml_op_name(ggmlop), qnn_graph_name.c_str(), error);
-            goto failure;
-        }
-
-        if (ctx->device == QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
-            QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0};
-
-            QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
-            QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0};
-
-            QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
-            QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0};
-        }
-
-        error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-
-        QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
-        QNN_VER_PTR(*tensor_0)->rank       = qnn_get_ggml_tensor_rank(src0);
-        QNN_VER_PTR(*tensor_0)->dataType   = src0_qnn_type;
-        QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
-        QNN_VER_PTR(*tensor_1)->rank       = qnn_get_ggml_tensor_rank(src1);
-        QNN_VER_PTR(*tensor_1)->dataType   = src1_qnn_type;
-        QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
-        QNN_VER_PTR(*tensor_2)->rank       = qnn_get_ggml_tensor_rank(dst);
-        QNN_VER_PTR(*tensor_2)->dataType   = dst_qnn_type;
-
-        if (ctx->device != QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
-                                                 qnn_get_ggml_tensor_data_size(src0)};
-            QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
-                                                 qnn_get_ggml_tensor_data_size(src1)};
-            QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
-                                                 qnn_get_ggml_tensor_data_size(dst)};
-        } else {
-            uint8_t * qnn_buffer_0 = nullptr;
-            uint8_t * qnn_buffer_1 = nullptr;
-            uint8_t * qnn_buffer_2 = nullptr;
-            qnn_instance * instance = ctx->instance;
-
-            qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
-            if (nullptr == qnn_buffer_0) {
-                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
-            } else {
-                QNN_LOG_INFO("alloc rpcmem successfully\n");
-            }
-            instance->register_rpcmem(qnn_buffer_0, tensor_0);
-            memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
-
-            qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src1), 4));
-            if (nullptr == qnn_buffer_1) {
-                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
-            } else {
-                QNN_LOG_INFO("alloc rpcmem successfully\n");
-            }
-            instance->register_rpcmem(qnn_buffer_1, tensor_1);
-            memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
-
-            qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(dst), 4));
-            if (nullptr == qnn_buffer_2) {
-                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
-            } else {
-                QNN_LOG_INFO("alloc rpcmem successfully\n");
-            }
-            instance->register_rpcmem(qnn_buffer_2, tensor_2);
-        }
-
-        Qnn_Tensor_t tensor_inputs[]  = {*tensor_0, *tensor_1};
-        Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1,
-                                    .v1 = {qnn_op_config_name.c_str(),
-                                           QNN_OP_PACKAGE_NAME_QTI_AISW,
-                                           qnn_op_name, 0, qnn_params, 2,
-                                           tensor_inputs, 1, tensor_outputs}};
-        error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.graphExecute(graph_handle,
-                                               tensor_inputs, 2,
-                                               tensor_outputs, 1,
-                                               nullptr, nullptr);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-
-        if (ctx->device == QNN_BACKEND_NPU) {
-            uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_2)->memHandle));
-            memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
-        }
-
-        auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2);
-        instance->_qnn_graph_map[map_entry] = graph_item;
-    } else {
-        auto & graph_item = instance->_qnn_graph_map[map_entry];
-        graph_handle = std::get<0>(graph_item);
-        tensor_0     = std::get<1>(graph_item);
-        tensor_1     = std::get<2>(graph_item);
-        tensor_2     = std::get<3>(graph_item);
-
-        uint32_t dimensions_input_0[] = {
-            (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-            (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]};
-        uint32_t dimensions_input_1[] = {
-            (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-            (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]};
-        uint32_t dimensions_output[] = {
-            (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
-            (uint32_t) dst->ne[3]};
-        QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
-        QNN_VER_PTR(*tensor_0)->rank       = qnn_get_ggml_tensor_rank(src0);
-        QNN_VER_PTR(*tensor_0)->dataType   = src0_qnn_type;
-        QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
-        QNN_VER_PTR(*tensor_1)->rank       = qnn_get_ggml_tensor_rank(src1);
-        QNN_VER_PTR(*tensor_1)->dataType   = src1_qnn_type;
-        QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
-        QNN_VER_PTR(*tensor_2)->rank       = qnn_get_ggml_tensor_rank(dst);
-        QNN_VER_PTR(*tensor_2)->dataType   = dst_qnn_type;
-
-        if (ctx->device != QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
-                                                 qnn_get_ggml_tensor_data_size(src0)};
-            QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
-                                                 qnn_get_ggml_tensor_data_size(src1)};
-            QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
-                                                 qnn_get_ggml_tensor_data_size(dst)};
-        } else {
-            uint8_t * qnn_buffer_0 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_0)->memHandle));
-            if (nullptr != qnn_buffer_0)
-                memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
-
-            uint8_t * qnn_buffer_1 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_1)->memHandle));
-            if (nullptr != qnn_buffer_1)
-                memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
-        }
-
-        Qnn_Tensor_t tensor_inputs[]  = {*tensor_0, *tensor_1};
-        Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        error = qnn_raw_interface.graphExecute(graph_handle,
-                                               tensor_inputs, 2,
-                                               tensor_outputs, 1,
-                                               nullptr, nullptr);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-
-        if (ctx->device == QNN_BACKEND_NPU) {
-            uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_2)->memHandle));
-            if (nullptr != qnn_buffer_2)
-                memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
-        }
-    }
-
-failure:
-    if (QNN_SUCCESS != error) {
-        QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0));
-        QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1));
-        QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2));
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                      " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      src0->name, src0->type, ggml_type_name(src0->type),
-                      src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0],
-                      src0->nb[1], src0->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                      " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      src1->name, src1->type, ggml_type_name(src1->type),
-                      src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0],
-                      src1->nb[1], src1->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                      " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0],
-                      dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]);
-        QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2],
-                      src0->ne[3]);
-    }
-
-    QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions;
-    QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions;
-    QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions;
-    perf.info();
-}
-
 static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx,
                             const ggml_tensor * src0, const ggml_tensor * src1,
                             ggml_tensor * dst) {
@@ -3038,21 +2819,14 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx,
                               struct ggml_compute_params * params,
                               struct ggml_tensor * tensor) {
     ggml_qnn_func_t func = nullptr;
-    ggml_qnn_func_common_t func_common = nullptr;

     switch (tensor->op) {
         case GGML_OP_ADD:
             func = ggml_qnn_add;
             break;

-        case GGML_OP_MUL:
-            func_common = ggml_qnn_hanlde_op;
-            break;
-
         case GGML_OP_MUL_MAT:
             func = ggml_qnn_mul_mat;
             break;

         case GGML_OP_REPEAT:
             func = ggml_qnn_repeat;
             break;
@@ -3062,15 +2836,12 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx,
         case GGML_OP_DUP:
            func = ggml_qnn_dup;
            break;

         case GGML_OP_ACC:
            func = ggml_qnn_acc;
            break;

         case GGML_OP_DIV:
            func = ggml_qnn_div;
            break;

         case GGML_OP_UNARY:
            switch (ggml_get_unary_op(tensor)) {
                case GGML_UNARY_OP_GELU:
@@ -3169,10 +2940,9 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx,
             return false;
     }

-    if (nullptr != func) func(ctx, tensor->src[0], tensor->src[1], tensor);
-
-    if (nullptr != func_common)
-        func_common(ctx, tensor->op, tensor->src[0], tensor->src[1], tensor);
+    if (nullptr != func) {
+        func(ctx, tensor->src[0], tensor->src[1], tensor);
+    }

     return true;
 }
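Review note: with func_common and ggml_qnn_hanlde_op gone, dispatch reduces to one function-pointer lookup per op. The shape of the surviving code, as a minimal standalone illustration (types simplified):

```cpp
#include <cstdio>

typedef void (*op_func_t)(int a, int b);

static void op_add(int a, int b)     { std::printf("%d\n", a + b); }
static void op_mul_mat(int a, int b) { std::printf("%d\n", a * b); }

// Same dispatch shape as ggml_qnn_compute_forward after the patch:
// select one function pointer in a switch, run it if set.
static bool dispatch(int op, int a, int b) {
    op_func_t func = nullptr;
    switch (op) {
        case 0: func = op_add;     break;
        case 1: func = op_mul_mat; break;
        default: return false;
    }
    if (nullptr != func) {
        func(a, b);
    }
    return true;
}
```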
@@ -3221,41 +2991,28 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t
     }
     Qnn_Tensor_t qnn_tensor = QNN_TENSOR_INIT;

-    if (ctx->device != QNN_BACKEND_GPU) {
-        qnn_tensor = {
-            .version = QNN_TENSOR_VERSION_1,
-            {.v1 = {.id = 0,
-                    .name = tensor_name,
-                    .type = qnn_tensor_type,
-                    .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
-                    .dataType = qnn_data_type,
-                    .quantizeParams =
-                        {QNN_DEFINITION_UNDEFINED,
-                         QNN_QUANTIZATION_ENCODING_UNDEFINED,
-                         {.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
-                                                  .offset = 0}}},
-                    .rank = qnn_get_ggml_tensor_rank(tensor),
-                    .dimensions = dimensions,
-                    .memType = QNN_TENSORMEMTYPE_RAW,
-                    {.clientBuf = {.data = nullptr, .dataSize = 0}}}}};
-    } else {
-        qnn_tensor = {
-            .version = QNN_TENSOR_VERSION_1,
-            {.v1 = {.id = 0,
-                    .name = tensor_name,
-                    .type = qnn_tensor_type,
-                    .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
-                    .dataType = qnn_data_type,
-                    .quantizeParams =
-                        {QNN_DEFINITION_UNDEFINED,
-                         QNN_QUANTIZATION_ENCODING_UNDEFINED,
-                         {.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
-                                                  .offset = 0}}},
-                    .rank = qnn_get_ggml_tensor_rank(tensor),
-                    .dimensions = dimensions,
-                    .memType = QNN_TENSORMEMTYPE_MEMHANDLE,
-                    {.clientBuf = {.data = nullptr, .dataSize = 0}}}}};
+    Qnn_TensorMemType_t qnn_mem_type = QNN_TENSORMEMTYPE_RAW;
+    if (ctx->device == QNN_BACKEND_GPU) {
+        qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE;
     }

+    qnn_tensor = {
+        .version = QNN_TENSOR_VERSION_1,
+        {.v1 = {.id = 0,
+                .name = tensor_name,
+                .type = qnn_tensor_type,
+                .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
+                .dataType = qnn_data_type,
+                .quantizeParams =
+                    {QNN_DEFINITION_UNDEFINED,
+                     QNN_QUANTIZATION_ENCODING_UNDEFINED,
+                     {.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
+                                              .offset = 0}}},
+                .rank = qnn_get_ggml_tensor_rank(tensor),
+                .dimensions = dimensions,
+                .memType = qnn_mem_type,
+                {.clientBuf = {.data = nullptr, .dataSize = 0}}}}};
+
     Qnn_Tensor_t * p_qnn_tensor =
         (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t));
     if (nullptr == p_qnn_tensor) {
@@ -12,8 +12,8 @@ ANDROID_PLATFORM=android-34

 GGML_QNN_UT=ggml-qnn-ut
 REMOTE_PATH=/data/local/tmp/
-BUILDTYPE=Debug
 BUILDTYPE=Release
+BUILDTYPE=Debug


 function dump_vars()
@@ -100,7 +100,7 @@ function update_qnn_libs()
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/

-    #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully
+    #the QNN NPU(aka HTP) backend only verified on Qualcomm Snapdragon 8 Gen 3 equipped Android phone
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/
@@ -142,14 +142,9 @@ function run_ggml_qnn_ut()

     case "$ggmlop" in
         GGML_OP_ADD)
-            echo "adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend"
             adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend
         ;;

-        GGML_OP_MUL)
-            adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL -b $qnnbackend
-        ;;
-
         GGML_OP_MUL_MAT)
             adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL_MAT -b $qnnbackend
         ;;
@@ -169,7 +164,6 @@ function show_usage()
     echo "  $0 build            (build Android command line UT program)"
     echo "  $0 updateqnnlibs    (upload the latest QNN libs to Android phone)"
     echo "  $0 GGML_OP_ADD      0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
-    echo "  $0 GGML_OP_MUL      0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
     echo "  $0 GGML_OP_MUL_MAT  0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
     echo -e "\n\n\n"
 }
@@ -346,7 +346,7 @@ static void show_usage() {
        "\nUsage: test_qnn_ops [options]\n" \
        "\n" \
        "Options:\n" \
-       " -t GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MULMAT\n" \
+       " -t GGML_OP_ADD / GGML_OP_MULMAT\n" \
        " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" \
        " ?/h print usage infomation\n\n"
     );
@@ -418,13 +418,9 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) {
     QNN_LOG_DEBUG("sizex: %d\n", sizex);
     QNN_LOG_DEBUG("sizey: %d\n", sizey);

-    if (n_ggml_op_type == GGML_OP_MUL) {
-        src0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-        src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-    } else {
     src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
     src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-    }
     ggml_set_input(src0);
     ggml_set_input(src1);

@@ -432,9 +428,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) {
     case GGML_OP_ADD:
         dst = ggml_add(ctx, src0, src1);
         break;
-    case GGML_OP_MUL:
-        dst = ggml_mul(ctx, src0, src1);
-        break;
     case GGML_OP_MUL_MAT:
         dst = ggml_mul_mat(ctx, src0, src1);
         break;
@@ -518,8 +511,6 @@ int main(int argc, char * argv[]) {
         n_ggml_op_type = GGML_OP_ADD;
     } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) {
         n_ggml_op_type = GGML_OP_MUL_MAT;
-    } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) {
-        n_ggml_op_type = GGML_OP_MUL;
     } else {
         show_usage();
         return 1;