ggml-qnn: refine ggml inference using QNN NPU

zhou.weiguo 2024-06-12 16:30:50 +08:00
parent 5269e082aa
commit faaa86b7e4
3 changed files with 504 additions and 169 deletions


@@ -1001,12 +1001,10 @@ class qnn_instance {
_qnn_interface.set_qnn_interface(_loaded_backend[backend_id]);
_qnn_interface.qnn_log_create(qnn_sdk_logcallback, _qnn_log_level,
&_qnn_log_handle);
_qnn_interface.qnn_log_create(qnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle);
if (nullptr == _qnn_log_handle) {
QNN_LOG_WARN(
"why failed to initialize qnn log\n"); // NPU backend not work on
// Qualcomm SoC equipped low-end phone
// the NPU backend does not work on low-end phones with Qualcomm SoCs
QNN_LOG_WARN("failed to initialize qnn log\n");
return 4;
} else {
QNN_LOG_DEBUG("initialize qnn log successfully\n");
@@ -1025,23 +1023,62 @@ class qnn_instance {
}
if (nullptr != _qnn_raw_interface.propertyHasCapability) {
auto qnnStatus =
Qnn_ErrorHandle_t qnn_status =
_qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE);
if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) {
if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) {
QNN_LOG_WARN("device property is not supported\n");
}
if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnStatus) {
if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) {
QNN_LOG_WARN("device property is not known to backend\n");
}
}
Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr,
&_qnn_device_handle);
Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS;
if (_backend_name.find("Htp") != std::variant_npos) {
const QnnDevice_PlatformInfo_t * p_info = nullptr;
_qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info);
QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices);
QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices;
QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = { };
for (int i = 0; i < p_info->v1.numHwDevices; i++) {
QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId,
infos[i].v1.deviceType, infos[i].v1.numCores);
QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension;
chipinfo = devinfo->onChipDevice;
QnnHtpDevice_Arch_t htp_arch = chipinfo.arch;
QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : "");
QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \
chipinfo.socModel, qnn_get_chipset_desc(chipinfo.socModel), \
htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize);
g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize };
}
_qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info);
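// chipinfo still holds the last device enumerated above; its SoC model and
// HTP arch feed the custom device configs below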
QnnHtpDevice_CustomConfig_t soc_customconfig;
soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC;
soc_customconfig.socModel = chipinfo.socModel;
QnnDevice_Config_t soc_devconfig;
soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
soc_devconfig.customConfig = &soc_customconfig;
QnnHtpDevice_CustomConfig_t arch_customconfig;
arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH;
arch_customconfig.arch.arch = chipinfo.arch;
arch_customconfig.arch.deviceId = 0; // ID of the device to use; defaults to 0 when a single device is present
QnnDevice_Config_t arch_devconfig;
arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
arch_devconfig.customConfig = &arch_customconfig;
const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, NULL};
qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle);
} else {
qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle);
}
if (QNN_SUCCESS != qnn_status &&
QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) {
QNN_LOG_WARN("failed to create QNN device\n");
} else {
QNN_LOG_INFO("create device successfully\n");
QNN_LOG_INFO("create QNN device successfully\n");
}
if (qnn_sdk_profile_level::profile_off != _profile_level) {
@@ -1096,9 +1133,9 @@ class qnn_instance {
return 9;
}
if (nullptr !=
_pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy
if (nullptr != _pfn_rpc_mem_init) { // keep low-end phones with Qualcomm SoCs happy
_pfn_rpc_mem_init();
}
std::vector<const QnnContext_Config_t *> temp_context_config;
_qnn_interface.qnn_context_create(
@@ -1113,32 +1150,14 @@ class qnn_instance {
}
if (_backend_name.find("Htp") != std::variant_npos) {
const QnnDevice_PlatformInfo_t * p_info = nullptr;
_qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info);
QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices);
QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices;
for (int i = 0; i < p_info->v1.numHwDevices; i++) {
QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId,
infos[i].v1.deviceType, infos[i].v1.numCores);
QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension;
QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice;
QnnHtpDevice_Arch_t htp_arch = chipinfo.arch;
QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : "");
QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \
chipinfo.socModel, qnn_get_chipset_desc(chipinfo.socModel), \
htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize);
g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize };
}
_qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info);
// TODO: find a faster way to probe the actual capacity of RPC ION memory
size_t candidate_size = 0;
uint8_t * rpc_buffer = nullptr;
const int SIZE_IN_MB = (1 << 20);
const int size_in_mb = (1 << 20);
size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048};
size_t probe_counts = sizeof(probe_slots) / sizeof(size_t);
for (size_t idx = 0; idx < probe_counts; idx++) {
rpc_buffer = static_cast<uint8_t *>(alloc_rpcmem(probe_slots[idx] * SIZE_IN_MB, 4));
rpc_buffer = static_cast<uint8_t *>(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4));
if (nullptr == rpc_buffer) {
QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno));
break;
@@ -1150,7 +1169,7 @@ class qnn_instance {
}
if (candidate_size > _rpcmem_capacity)
_rpcmem_capacity = candidate_size;
QNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity);
QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity);
if (0 != init_htp_perfinfra()) {
QNN_LOG_WARN("initialize HTP performance failure");
@@ -1181,6 +1200,10 @@ class qnn_instance {
QNN_LOG_DEBUG("succeed to close rpcmem lib\n");
}
if (_backend_name.find("Htp") != std::variant_npos) {
_qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid);
}
if (nullptr != _qnn_context_handle) {
error = _qnn_interface.qnn_context_free(_qnn_context_handle,
_qnn_profile_handle);
@@ -1239,6 +1262,9 @@ class qnn_instance {
return ret_status;
}
// kept for future use: offloading an entire cgraph directly to a single QNN DAG,
// as done in Qualcomm's dedicated AI stack
#if 0
int init_qnn_graph(const char * graph_name, bool debug,
uint8_t do_node_validation = true,
const QnnGraph_Config_t ** graph_configs = nullptr) {
@@ -1288,6 +1314,7 @@ class qnn_instance {
return 0;
}
#endif
const qnn_interface & get_qnn_interface() {
if (!_qnn_interface.is_loaded()) {
@@ -1362,70 +1389,86 @@ class qnn_instance {
}
int set_rpc_polling() {
if (_qnn_rpc_pollingtime > 0) {
QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime;
memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime));
rpc_pollingTime.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME;
rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime;
if (_qnn_htp_perfinfra) {
QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time;
memset(&rpc_polling_time, 0, sizeof(rpc_polling_time));
rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME;
// recommended RPC polling time range is 0-10000 us
rpc_polling_time.rpcPollingTimeConfig = 9999;
QnnHtpPerfInfrastructure_PowerConfig_t rpc_ControlLatency;
memset(&rpc_ControlLatency, 0, sizeof(rpc_ControlLatency));
rpc_ControlLatency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY;
rpc_ControlLatency.rpcControlLatencyConfig = 40;
QnnHtpPerfInfrastructure_PowerConfig_t rpc_control_latency;
memset(&rpc_control_latency, 0, sizeof(rpc_control_latency));
rpc_control_latency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY;
// recommended RPC control latency is 100 us, per the Hexagon SDK
rpc_control_latency.rpcControlLatencyConfig = 100;
const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = {&rpc_pollingTime, &rpc_ControlLatency, nullptr};
if (_qnn_htp_perfinfra) {
_qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs);
const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {
&rpc_polling_time,
&rpc_control_latency,
nullptr};
Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig(
_qnn_power_configid,
power_configs);
if (qnn_status != QNN_SUCCESS) {
QNN_LOG_WARN("set htp perf failed\n");
} else {
QNN_LOG_INFO("set htp perf ok\n");
}
} else {
QNN_LOG_WARN("can't set htp perf\n");
}
return 0;
}
int set_high_performance_mode() {
if (nullptr == _qnn_htp_perfinfra) {
QNN_LOG_DEBUG("perf intra is null\n");
QNN_LOG_WARN("perf intra is null\n");
return 1;
}
QnnHtpPerfInfrastructure_PowerConfig_t powerConfig;
memset(&powerConfig, 0, sizeof(powerConfig));
powerConfig.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3;
powerConfig.dcvsV3Config.dcvsEnable = 0;
powerConfig.dcvsV3Config.setDcvsEnable = 1;
powerConfig.dcvsV3Config.contextId = _qnn_power_configid;
powerConfig.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE;
powerConfig.dcvsV3Config.setSleepLatency =
1; // true to consider Latency parameter otherwise False
powerConfig.dcvsV3Config.setBusParams =
1; // true to consider Bus parameter otherwise False
powerConfig.dcvsV3Config.setCoreParams =
1; // true to consider Core parameter otherwise False
powerConfig.dcvsV3Config.sleepDisable =
0; // true to consider sleep/LPM modes, False to enable
powerConfig.dcvsV3Config.setSleepDisable =
0; // true to consider sleep disable/enable parameter otherwise False set sleep latency parameter
uint32_t latencyValue = 40;
powerConfig.dcvsV3Config.sleepLatency =
latencyValue; // range 40-2000 micro sec
QnnHtpPerfInfrastructure_PowerConfig_t power_config;
memset(&power_config, 0, sizeof(power_config));
power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3;
power_config.dcvsV3Config.dcvsEnable = 0;
power_config.dcvsV3Config.setDcvsEnable = 1;
power_config.dcvsV3Config.contextId = _qnn_power_configid;
power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE;
power_config.dcvsV3Config.setSleepLatency =
1; // true to consider Latency parameter otherwise false
power_config.dcvsV3Config.sleepLatency = 10;
power_config.dcvsV3Config.setBusParams =
1; // true to consider Bus parameter otherwise false
power_config.dcvsV3Config.setCoreParams =
1; // true to consider Core parameter otherwise false
power_config.dcvsV3Config.sleepDisable =
1; // true to consider sleep/LPM modes, false to enable
power_config.dcvsV3Config.setSleepDisable =
1; // true to consider the sleep disable/enable parameter, otherwise false
// set Bus Clock Parameters
powerConfig.dcvsV3Config.busVoltageCornerMin =
power_config.dcvsV3Config.busVoltageCornerMin =
DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
powerConfig.dcvsV3Config.busVoltageCornerTarget =
power_config.dcvsV3Config.busVoltageCornerTarget =
DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
powerConfig.dcvsV3Config.busVoltageCornerMax =
power_config.dcvsV3Config.busVoltageCornerMax =
DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
// set Core Clock Parameters
powerConfig.dcvsV3Config.coreVoltageCornerMin =
power_config.dcvsV3Config.coreVoltageCornerMin =
DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
powerConfig.dcvsV3Config.coreVoltageCornerTarget =
power_config.dcvsV3Config.coreVoltageCornerTarget =
DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
powerConfig.dcvsV3Config.coreVoltageCornerMax =
power_config.dcvsV3Config.coreVoltageCornerMax =
DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
// set power config with different performance parameters
const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {
&powerConfig, nullptr};
_qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs);
const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {
&power_config, nullptr};
Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS;
qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs);
if (qnn_status != QNN_SUCCESS) {
QNN_LOG_WARN("set htp high performance mode failed\n");
} else {
QNN_LOG_INFO("set htp high performance mode ok\n");
}
return 0;
}
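// how the two helpers above are expected to be chained on the NPU path; a
// sketch, assuming init_htp_perfinfra() returns 0 on success (as checked
// earlier) and fills _qnn_htp_perfinfra / _qnn_power_configid
#if 0
if (0 == init_htp_perfinfra()) {
    set_rpc_polling();           // shorten the RPC polling window
    set_high_performance_mode(); // pin DCVS v3 voltage corners to max
}
#endif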
@@ -1505,7 +1548,7 @@ class qnn_instance {
if (is_rpcmem_allocated(p_data)) {
QNN_LOG_WARN("rpc memory already allocated\n");
// return 3;
return 3;
}
if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) {
QNN_LOG_WARN("tensor %s has been registered shared memory\n",
@@ -1518,7 +1561,7 @@ class qnn_instance {
QNN_LOG_WARN("failed to get file descriptor\n");
return 5;
}
QNN_LOG_DEBUG("mem_fd %d\n", mem_fd);
QNN_LOG_INFO("mem_fd %d\n", mem_fd);
Qnn_MemDescriptor_t descriptor = {{QNN_VER_PTR(*p_tensor)->rank,
QNN_VER_PTR(*p_tensor)->dimensions,
nullptr},
@@ -1538,11 +1581,24 @@ class qnn_instance {
(QNN_VER_PTR(*p_tensor)->name));
}
QNN_VER_PTR(*p_tensor)->memHandle = handle;
_qnn_mem_set.insert(handle);
_qnn_mem_set.insert((std::pair<void*, Qnn_MemHandle_t>(p_data, handle)));
return 0;
}
void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) {
for (std::unordered_map<void *, Qnn_MemHandle_t>::iterator it = _qnn_mem_set.begin();
it != _qnn_mem_set.end();
it++) {
if (it->second == mem_handle) {
return it->first;
}
}
QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle);
return nullptr;
}
void unregister_rpcmem() {
Qnn_ErrorHandle_t error = QNN_SUCCESS;
@@ -1550,7 +1606,10 @@ class qnn_instance {
QNN_LOG_WARN("no rpcmem registered\n");
}
for (auto & mem_handle : _qnn_mem_set) {
for (std::unordered_map<void *, Qnn_MemHandle_t>::iterator it = _qnn_mem_set.begin();
it != _qnn_mem_set.end();
it++) {
Qnn_MemHandle_t mem_handle = it->second;
error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1);
if (error != QNN_SUCCESS) {
QNN_LOG_WARN("failed to unregister shared memory, error %d\n",
@@ -1561,7 +1620,7 @@ class qnn_instance {
}
bool is_rpcmem_allocated(void * buf) {
return _rpcmem_store_map.count(buf) != 0U;
return _qnn_mem_set.count(buf) != 0U;
}
@@ -1686,8 +1745,9 @@ class qnn_instance {
return 1;
}
auto get_providers = load_qnn_functionpointers<pfn_qnninterface_getproviders *>(
lib_handle, "QnnInterface_getProviders");
auto get_providers =
load_qnn_functionpointers<pfn_qnninterface_getproviders *>(
lib_handle, "QnnInterface_getProviders");
if (nullptr == get_providers) {
QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s",
dlerror());
@@ -1786,7 +1846,7 @@ class qnn_instance {
private:
std::string _lib_path;
std::string _backend_name;
std::string _model_name; // prebuilt QNN model name, not used currently
std::string _model_name; // name of a prebuilt, Qualcomm-dedicated model; kept for future use
BackendIdType _backend_id;
bool _debug_tensor = false;
@@ -1816,12 +1876,11 @@ class qnn_instance {
QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr;
uint32_t _qnn_power_configid = 1;
uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing
QNN_INTERFACE_VER_TYPE _qnn_raw_interface;
QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface;
std::unordered_set<Qnn_MemHandle_t> _qnn_mem_set;
std::unordered_map<void *, Qnn_MemHandle_t> _qnn_mem_set;
std::mutex _init_mutex;
std::unordered_map<BackendIdType, void *> _loaded_lib_handle;
@@ -1898,9 +1957,8 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
if (tensor->op == GGML_OP_MUL_MAT) {
if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) {
return false;
} else {
return true;
//make mul_mat with QNN RPC happy
//return false;
}
}
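// net effect of the gate above: only sufficiently large matmuls are offloaded,
// e.g. a batch of 32 or fewer tokens (ne11 <= 32) stays on the default backend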
@@ -1964,17 +2022,29 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
if (!graph_initialized) {
graph_name = graph_name + "_" + std::to_string(ctx->threads) +
src0->name + "_" + src1->name;
"_" + src0->name + "_" + src1->name;
QNN_LOG_INFO("graph name %s", graph_name.c_str());
if (ctx->device == QNN_BACKEND_NPU) {
QnnHtpGraph_CustomConfig_t custom_config;
custom_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
custom_config.numHvxThreads = 8;
QnnHtpGraph_CustomConfig_t hvx_config;
hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
hvx_config.numHvxThreads = 8;
QnnGraph_Config_t graph_hvx_config;
graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
graph_hvx_config.customConfig = &hvx_config;
QnnGraph_Config_t graph_config;
graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
graph_config.customConfig = &custom_config;
const QnnGraph_Config_t * p_graphconfig[] = {&graph_config, NULL};
QnnHtpGraph_CustomConfig_t dlbc_config;
dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
/*
dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC
*/
dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
QnnGraph_Config_t graph_dlbc_config;
graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
graph_dlbc_config.customConfig = &dlbc_config;
const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL};
error = qnn_raw_interface.graphCreate(
instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig,
&graph_handle);
@@ -1989,7 +2059,21 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
"error = %d\n",
graph_name.c_str(), error);
goto failure;
} else {
QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str());
}
if (ctx->device == QNN_BACKEND_NPU) {
QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0};
QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0};
QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0};
}
error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0);
if (QNN_SUCCESS != error) {
QNN_LOG_INFO("error = %d\n", error);
@@ -2006,13 +2090,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
goto failure;
}
QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
qnn_get_ggml_tensor_data_size(src0)};
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
qnn_get_ggml_tensor_data_size(src1)};
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
qnn_get_ggml_tensor_data_size(dst)};
QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0);
QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type;
@@ -2023,6 +2100,46 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst);
QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
if (ctx->device != QNN_BACKEND_NPU) {
QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
qnn_get_ggml_tensor_data_size(src0)};
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
qnn_get_ggml_tensor_data_size(src1)};
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
qnn_get_ggml_tensor_data_size(dst)};
} else {
uint8_t * qnn_buffer_0 = nullptr;
uint8_t * qnn_buffer_1 = nullptr;
uint8_t * qnn_buffer_2 = nullptr;
qnn_instance * instance = ctx->instance;
qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
if (nullptr == qnn_buffer_0) {
QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
} else {
QNN_LOG_INFO("alloc rpcmem successfully\n");
}
instance->register_rpcmem(qnn_buffer_0, tensor_0);
if (nullptr != qnn_buffer_0)
memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src1), 4));
if (nullptr == qnn_buffer_1) {
QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
} else {
QNN_LOG_INFO("alloc rpcmem successfully\n");
}
instance->register_rpcmem(qnn_buffer_1, tensor_1);
if (nullptr != qnn_buffer_1)
memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(dst), 4));
if (nullptr == qnn_buffer_2) {
QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
} else {
QNN_LOG_INFO("alloc rpcmem successfully\n");
}
instance->register_rpcmem(qnn_buffer_2, tensor_2);
}
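// NPU path above: the tensors carry no client buffer; each ggml buffer is
// mirrored into ION shared memory via alloc_rpcmem(), bound to its tensor's
// memHandle via register_rpcmem(), and dst is copied back out of that handle
// after graphExecute() below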
Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
Qnn_OpConfig_t op_config = {
@@ -2048,6 +2165,12 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
QNN_LOG_INFO("error = %d\n", error);
goto failure;
}
if (ctx->device == QNN_BACKEND_NPU) {
uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
QNN_VER_PTR(*tensor_2)->memHandle));
if (nullptr != qnn_buffer_2)
memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
}
auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2);
instance->_qnn_graph_map[map_entry] = graph_item;
} else {
@@ -2067,13 +2190,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
(uint32_t) dst->ne[3]};
QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
qnn_get_ggml_tensor_data_size(src0)};
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
qnn_get_ggml_tensor_data_size(src1)};
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
qnn_get_ggml_tensor_data_size(dst)};
QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0);
QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type;
@@ -2084,6 +2200,25 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst);
QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
if (ctx->device != QNN_BACKEND_NPU) {
QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
qnn_get_ggml_tensor_data_size(src0)};
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
qnn_get_ggml_tensor_data_size(src1)};
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
qnn_get_ggml_tensor_data_size(dst)};
} else {
uint8_t * qnn_buffer_0 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
QNN_VER_PTR(*tensor_0)->memHandle));
if (nullptr != qnn_buffer_0)
memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
uint8_t * qnn_buffer_1 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
QNN_VER_PTR(*tensor_1)->memHandle));
if (nullptr != qnn_buffer_1)
memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
}
Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2,
@@ -2093,7 +2228,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
QNN_LOG_INFO("error = %d\n", error);
goto failure;
}
if (ctx->device == QNN_BACKEND_NPU) {
uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
QNN_VER_PTR(*tensor_2)->memHandle));
if (nullptr != qnn_buffer_2)
memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
}
}
failure:
if (QNN_SUCCESS != error) {
QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0));
@@ -2197,17 +2340,55 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
if (!graph_initialized) {
graph_name = graph_name + "_" + std::to_string(ctx->threads) +
src0->name + "_" + src1->name;
"_" + src0->name + "_" + src1->name;
QNN_LOG_INFO("graph name %s", graph_name.c_str());
error = qnn_raw_interface.graphCreate(
instance->get_qnn_context_handle(), graph_name.c_str(), nullptr,
&graph_handle);
if (ctx->device == QNN_BACKEND_NPU) {
QnnHtpGraph_CustomConfig_t hvx_config;
hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
hvx_config.numHvxThreads = 8;
QnnGraph_Config_t graph_hvx_config;
graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
graph_hvx_config.customConfig = &hvx_config;
QnnHtpGraph_CustomConfig_t dlbc_config;
dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
/*
dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC
*/
dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
QnnGraph_Config_t graph_dlbc_config;
graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
graph_dlbc_config.customConfig = &dlbc_config;
const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL};
error = qnn_raw_interface.graphCreate(
instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig,
&graph_handle);
} else {
error = qnn_raw_interface.graphCreate(
instance->get_qnn_context_handle(), graph_name.c_str(), nullptr,
&graph_handle);
}
if (QNN_SUCCESS != error) {
QNN_LOG_INFO("can't create qnn graph handle with graph name %s, "
"error = %d\n",
graph_name.c_str(), error);
goto failure;
}
if (ctx->device == QNN_BACKEND_NPU) {
QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0};
QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0};
QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0};
}
error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0);
if (QNN_SUCCESS != error) {
QNN_LOG_INFO("error = %d\n", error);
@@ -2224,13 +2405,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
goto failure;
}
QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
qnn_get_ggml_tensor_data_size(src0)};
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
qnn_get_ggml_tensor_data_size(src1)};
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
qnn_get_ggml_tensor_data_size(dst)};
QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0);
QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type;
@@ -2241,6 +2415,46 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst);
QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
if (ctx->device != QNN_BACKEND_NPU) {
QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
qnn_get_ggml_tensor_data_size(src0)};
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
qnn_get_ggml_tensor_data_size(src1)};
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
qnn_get_ggml_tensor_data_size(dst)};
} else {
uint8_t * qnn_buffer_0 = nullptr;
uint8_t * qnn_buffer_1 = nullptr;
uint8_t * qnn_buffer_2 = nullptr;
qnn_instance * instance = ctx->instance;
qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
if (nullptr == qnn_buffer_0) {
QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
} else {
QNN_LOG_INFO("alloc rpcmem successfully\n");
}
instance->register_rpcmem(qnn_buffer_0, tensor_0);
if (nullptr != qnn_buffer_0)
memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src1), 4));
if (nullptr == qnn_buffer_1) {
QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
} else {
QNN_LOG_INFO("alloc rpcmem successfully\n");
}
instance->register_rpcmem(qnn_buffer_1, tensor_1);
if (nullptr != qnn_buffer_1)
memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(dst), 4));
if (nullptr == qnn_buffer_2) {
QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
} else {
QNN_LOG_INFO("alloc rpcmem successfully\n");
}
instance->register_rpcmem(qnn_buffer_2, tensor_2);
}
Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1,
@@ -2266,6 +2480,13 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
QNN_LOG_INFO("error = %d\n", error);
goto failure;
}
if (ctx->device == QNN_BACKEND_NPU) {
uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
QNN_VER_PTR(*tensor_2)->memHandle));
if (nullptr != qnn_buffer_2)
memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
}
auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2);
instance->_qnn_graph_map[map_entry] = graph_item;
} else {
@@ -2294,12 +2515,24 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst);
QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
qnn_get_ggml_tensor_data_size(src0)};
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
qnn_get_ggml_tensor_data_size(src1)};
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
qnn_get_ggml_tensor_data_size(dst)};
if (ctx->device != QNN_BACKEND_NPU) {
QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
qnn_get_ggml_tensor_data_size(src0)};
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
qnn_get_ggml_tensor_data_size(src1)};
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
qnn_get_ggml_tensor_data_size(dst)};
} else {
uint8_t * qnn_buffer_0 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
QNN_VER_PTR(*tensor_0)->memHandle));
if (nullptr != qnn_buffer_0)
memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
uint8_t * qnn_buffer_1 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
QNN_VER_PTR(*tensor_1)->memHandle));
if (nullptr != qnn_buffer_1)
memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
}
Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
@@ -2311,7 +2544,15 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
QNN_LOG_INFO("error = %d\n", error);
goto failure;
}
if (ctx->device == QNN_BACKEND_NPU) {
uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
QNN_VER_PTR(*tensor_2)->memHandle));
if (nullptr != qnn_buffer_2)
memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
}
}
failure:
if (QNN_SUCCESS != error) {
QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0));
@@ -2428,6 +2669,17 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx,
goto failure;
}
if (ctx->device == QNN_BACKEND_NPU) {
QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0};
QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0};
QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0};
}
error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0);
if (QNN_SUCCESS != error) {
QNN_LOG_INFO("error = %d\n", error);
@@ -2444,13 +2696,6 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx,
goto failure;
}
QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
qnn_get_ggml_tensor_data_size(src0)};
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
qnn_get_ggml_tensor_data_size(src1)};
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
qnn_get_ggml_tensor_data_size(dst)};
QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0);
QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type;
@@ -2461,6 +2706,46 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx,
QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst);
QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
if (ctx->device != QNN_BACKEND_NPU) {
QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
qnn_get_ggml_tensor_data_size(src0)};
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
qnn_get_ggml_tensor_data_size(src1)};
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
qnn_get_ggml_tensor_data_size(dst)};
} else {
uint8_t * qnn_buffer_0 = nullptr;
uint8_t * qnn_buffer_1 = nullptr;
uint8_t * qnn_buffer_2 = nullptr;
qnn_instance * instance = ctx->instance;
qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
if (nullptr == qnn_buffer_0) {
QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
} else {
QNN_LOG_INFO("alloc rpcmem successfully\n");
}
instance->register_rpcmem(qnn_buffer_0, tensor_0);
if (nullptr != qnn_buffer_0)
memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src1), 4));
if (nullptr == qnn_buffer_1) {
QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
} else {
QNN_LOG_INFO("alloc rpcmem successfully\n");
}
instance->register_rpcmem(qnn_buffer_1, tensor_1);
if (nullptr != qnn_buffer_1)
memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(dst), 4));
if (nullptr == qnn_buffer_2) {
QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
} else {
QNN_LOG_INFO("alloc rpcmem successfully\n");
}
instance->register_rpcmem(qnn_buffer_2, tensor_2);
}
Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1,
@@ -2486,6 +2771,13 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx,
QNN_LOG_INFO("error = %d\n", error);
goto failure;
}
if (ctx->device == QNN_BACKEND_NPU) {
uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
QNN_VER_PTR(*tensor_2)->memHandle));
if (nullptr != qnn_buffer_2)
memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
}
auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2);
instance->_qnn_graph_map[map_entry] = graph_item;
} else {
@@ -2514,17 +2806,28 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx,
QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst);
QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
qnn_get_ggml_tensor_data_size(src0)};
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
qnn_get_ggml_tensor_data_size(src1)};
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
qnn_get_ggml_tensor_data_size(dst)};
if (ctx->device != QNN_BACKEND_NPU) {
QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
qnn_get_ggml_tensor_data_size(src0)};
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
qnn_get_ggml_tensor_data_size(src1)};
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
qnn_get_ggml_tensor_data_size(dst)};
} else {
uint8_t * qnn_buffer_0 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
QNN_VER_PTR(*tensor_0)->memHandle));
if (nullptr != qnn_buffer_0)
memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
uint8_t * qnn_buffer_1 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
QNN_VER_PTR(*tensor_1)->memHandle));
if (nullptr != qnn_buffer_1)
memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
}
Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
error =
qnn_raw_interface.graphExecute(graph_handle,
error = qnn_raw_interface.graphExecute(graph_handle,
tensor_inputs, 2,
tensor_outputs, 1,
nullptr, nullptr);
@@ -2532,7 +2835,15 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx,
QNN_LOG_INFO("error = %d\n", error);
goto failure;
}
if (ctx->device == QNN_BACKEND_NPU) {
uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
QNN_VER_PTR(*tensor_2)->memHandle));
if (nullptr != qnn_buffer_2)
memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
}
}
failure:
if (QNN_SUCCESS != error) {
QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0));
@@ -2889,9 +3200,9 @@ GGML_CALL static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t b
GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer,
ggml_tensor * tensor) {
Qnn_ErrorHandle_t error = QNN_SUCCESS;
ggml_backend_qnn_buffer_context * ctx =
(ggml_backend_qnn_buffer_context *) buffer->context;
Qnn_ErrorHandle_t error = QNN_SUCCESS;
ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context;
static int idx = 0;
char tensor_name[GGML_MAX_NAME] = {0};
snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++);
@@ -2908,22 +3219,43 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t
} else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) {
qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ;
}
Qnn_Tensor_t qnn_tensor = {
.version = QNN_TENSOR_VERSION_1,
{.v1 = {.id = 0,
.name = tensor_name,
.type = qnn_tensor_type,
.dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
.dataType = qnn_data_type,
.quantizeParams =
{QNN_DEFINITION_UNDEFINED,
QNN_QUANTIZATION_ENCODING_UNDEFINED,
{.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
.offset = 0}}},
.rank = qnn_get_ggml_tensor_rank(tensor),
.dimensions = dimensions,
.memType = QNN_TENSORMEMTYPE_RAW,
{.clientBuf = {.data = nullptr, .dataSize = 0}}}}};
Qnn_Tensor_t qnn_tensor = QNN_TENSOR_INIT;
if (ctx->device != QNN_BACKEND_GPU) {
qnn_tensor = {
.version = QNN_TENSOR_VERSION_1,
{.v1 = {.id = 0,
.name = tensor_name,
.type = qnn_tensor_type,
.dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
.dataType = qnn_data_type,
.quantizeParams =
{QNN_DEFINITION_UNDEFINED,
QNN_QUANTIZATION_ENCODING_UNDEFINED,
{.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
.offset = 0}}},
.rank = qnn_get_ggml_tensor_rank(tensor),
.dimensions = dimensions,
.memType = QNN_TENSORMEMTYPE_RAW,
{.clientBuf = {.data = nullptr, .dataSize = 0}}}}};
} else {
qnn_tensor = {
.version = QNN_TENSOR_VERSION_1,
{.v1 = {.id = 0,
.name = tensor_name,
.type = qnn_tensor_type,
.dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
.dataType = qnn_data_type,
.quantizeParams =
{QNN_DEFINITION_UNDEFINED,
QNN_QUANTIZATION_ENCODING_UNDEFINED,
{.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
.offset = 0}}},
.rank = qnn_get_ggml_tensor_rank(tensor),
.dimensions = dimensions,
.memType = QNN_TENSORMEMTYPE_MEMHANDLE,
{.clientBuf = {.data = nullptr, .dataSize = 0}}}}};
}
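// note the split above: every backend except the GPU starts from
// QNN_TENSORMEMTYPE_RAW (the NPU op paths later switch their tensors to
// QNN_TENSORMEMTYPE_MEMHANDLE), while the GPU path pre-declares MEMHANDLE here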
Qnn_Tensor_t * p_qnn_tensor =
(Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t));
if (nullptr == p_qnn_tensor) {
@@ -2933,7 +3265,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t
error = deep_copy_qnn_tensors(qnn_tensor, *p_qnn_tensor);
if (error != QNN_SUCCESS) {
free(p_qnn_tensor);
QNN_LOG_DEBUG("init tensor failed");
QNN_LOG_WARN("init tensor failed");
return;
}
tensor->extra = p_qnn_tensor;
@@ -3210,6 +3542,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) {
device, GGML_QNN_MAX_DEVICES - 1);
return nullptr;
}
static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES];
static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES];
static bool ggml_backend_qnn_buffer_type_initialized = false;
@@ -3307,7 +3640,6 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) {
std::string device_name = qnn_get_backend_name(device);
QNN_LOG_INFO("qnn device name %s", device_name.c_str());
instance->init_qnn_graph(device_name.c_str(), false);
g_qnn_mgr[device].instance = instance;
g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface();
g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface();


@@ -6,8 +6,8 @@ set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
#set to ON if target Android phone is based on Qualcomm Snapdragon 8 Gen 3
set(TARGET_SNAPDRAGON_8_GEN3 OFF)
#set to OFF if the target Android phone is not equipped with a Qualcomm Snapdragon 8 Gen 3
set(TARGET_SNAPDRAGON_8_GEN3 ON)
set(QNN_INC_PATH ${QNN_SDK_PATH}/include/QNN)
set(QNN_LIB_PATH ${QNN_SDK_PATH}/lib/aarch64-android)
@@ -35,6 +35,8 @@ add_definitions(-DGGML_USE_QNN)
if(CMAKE_BUILD_TYPE STREQUAL "Release")
add_definitions(-DNDEBUG)
add_definitions(-O3)
else()
add_definitions(-O3)
endif()
if (TARGET_SNAPDRAGON_8_GEN3)
@@ -44,7 +46,7 @@ add_definitions(-mcpu=cortex-x1)
add_definitions(-mtune=cortex-x1)
else()
# the below build optimization might be works well on ALL mainstream Android phone based on Qualcomm mobile SoC
# the build optimization below might work well on all Android phones equipped with mainstream Qualcomm mobile SoCs
add_definitions(-mcpu=cortex-a72)
endif()
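# typical configure step for this backend (a sketch: the NDK toolchain path,
# API level and QNN SDK location below are placeholders, not part of this repo)
#   cmake -H. -B./build \
#         -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
#         -DANDROID_ABI=arm64-v8a \
#         -DANDROID_PLATFORM=android-26 \
#         -DQNN_SDK_PATH=/path/to/qnn-sdk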


@@ -415,7 +415,8 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) {
sizex = ggml_blck_size(qtype) * 2;
}
}
QNN_LOG_DEBUG("sizex %d\n", sizex);
QNN_LOG_DEBUG("sizex: %d\n", sizex);
QNN_LOG_DEBUG("sizey: %d\n", sizey);
if (n_ggml_op_type == GGML_OP_MUL) {
src0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);