ggml-qnn: refine ggml inference using QNN NPU
parent 5f8cfe4a1e
commit 5269e082aa
3 changed files with 156 additions and 160 deletions

ggml-qnn.cpp (256 changed lines)
@@ -55,6 +55,7 @@
#include "Saver/QnnSaver.h"
#include "System/QnnSystemInterface.h"
#include "HTP/QnnHtpDevice.h"
#include <HTP/QnnHtpGraph.h>

// =================================================================================================
//
@@ -72,9 +73,16 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor);
// self-defined macro / data structure
//
// =================================================================================================
#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend
#ifdef NDEBUG
#define ENABLE_QNNBACKEND_DEBUG 0 // for troubleshooting QNN backend
#define ENABLE_QNNSDK_LOG 0 // enable/disable QNN SDK's internal log
#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info
#else
#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend
#define ENABLE_QNNSDK_LOG 1 // enable/disable QNN SDK's internal log
#define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info
#endif

#define QNN_LOGBUF_LEN 4096
#define QNN_BACKEND_NAME "qnn"
@@ -393,7 +401,6 @@ static void qnn_internal_log(ggml_log_level level, const char * file,
}
}

static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
const ggml_tensor * src1, ggml_tensor * dst) {
if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) {
@@ -438,8 +445,8 @@ public:
void info() {
_end_time = ggml_time_us();
_duration = (_end_time - _begin_time) / 1000;
QNN_LOG_DEBUG("duration of %s : %lld milliseconds\n", _perf_name.c_str(), _duration);
_duration = (_end_time - _begin_time);
QNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration);
}

private:
@@ -473,15 +480,15 @@ enum qnn_sdk_profile_level {
profile_detail = 2
};

using _pfn_rpc_mem_init = void (*)(void);
using _pfn_rpc_mem_deinit = void (*)(void);
using _pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int);
using _pfn_rpc_mem_free = void (*)(void *);
using _pfn_rpc_mem_to_fd = int (*)(void *);
using pfn_rpc_mem_init = void (*)(void);
using pfn_rpc_mem_deinit = void (*)(void);
using pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int);
using pfn_rpc_mem_free = void (*)(void *);
using pfn_rpc_mem_to_fd = int (*)(void *);

using _pfn_qnnsaver_initialize = decltype(QnnSaver_initialize);
using _pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders);
using _pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders);
using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize);
using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders);
using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders);

#define QNN_VER_PTR(x) (&((x).v1))
#define RPCMEM_DEFAULT_FLAGS 1
@@ -702,7 +709,7 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) {
if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
Qnn_QuantizeParams_t src_qparam_cpy = src_qparam;
Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding;
Qnn_ScaleOffset_t **scaleOffset = & axis_scale_offset.scaleOffset;
Qnn_ScaleOffset_t ** scaleOffset = & axis_scale_offset.scaleOffset;
size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t);
*scaleOffset = (Qnn_ScaleOffset_t *) malloc(scaleOffsetSize);
memscpy(*scaleOffset, scaleOffsetSize,
@@ -733,7 +740,7 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) {
uint32_t rank = QNN_TENSOR_GET_RANK(src);
QNN_TENSOR_SET_RANK(dst, rank);
size_t dim_size = rank * sizeof(uint32_t);
uint32_t *dimensions = (uint32_t *) malloc(dim_size);
uint32_t * dimensions = (uint32_t *) malloc(dim_size);
if (dimensions == nullptr) {
QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying "
"tensor %s\n",
@@ -1072,26 +1079,26 @@ class qnn_instance {
QNN_LOG_DEBUG("load rpcmem lib successfully\n");
set_rpcmem_initialized(true);
}
__pfn_rpc_mem_init = reinterpret_cast<_pfn_rpc_mem_init>(
_pfn_rpc_mem_init = reinterpret_cast<pfn_rpc_mem_init>(
dlsym(_rpc_lib_handle, "rpcmem_init"));
__pfn_rpc_mem_deinit = reinterpret_cast<_pfn_rpc_mem_deinit>(
_pfn_rpc_mem_deinit = reinterpret_cast<pfn_rpc_mem_deinit>(
dlsym(_rpc_lib_handle, "rpcmem_deinit"));
__pfn_rpc_mem_alloc = reinterpret_cast<_pfn_rpc_mem_alloc>(
_pfn_rpc_mem_alloc = reinterpret_cast<pfn_rpc_mem_alloc>(
dlsym(_rpc_lib_handle, "rpcmem_alloc"));
__pfn_rpc_mem_free = reinterpret_cast<_pfn_rpc_mem_free>(
_pfn_rpc_mem_free = reinterpret_cast<pfn_rpc_mem_free>(
dlsym(_rpc_lib_handle, "rpcmem_free"));
__pfn_rpc_mem_to_fd = reinterpret_cast<_pfn_rpc_mem_to_fd>(
_pfn_rpc_mem_to_fd = reinterpret_cast<pfn_rpc_mem_to_fd>(
dlsym(_rpc_lib_handle, "rpcmem_to_fd"));
if (nullptr == __pfn_rpc_mem_alloc || nullptr == __pfn_rpc_mem_free ||
nullptr == __pfn_rpc_mem_to_fd) {
if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free ||
nullptr == _pfn_rpc_mem_to_fd) {
QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror());
dlclose(_rpc_lib_handle);
return 9;
}

if (nullptr !=
__pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy
__pfn_rpc_mem_init();
_pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy
_pfn_rpc_mem_init();

std::vector<const QnnContext_Config_t *> temp_context_config;
_qnn_interface.qnn_context_create(
@@ -1124,7 +1131,6 @@ class qnn_instance {
}
_qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info);

//TODO: faster approach to probe the accurate capacity of rpc ion memory
size_t candidate_size = 0;
uint8_t * rpc_buffer = nullptr;
@@ -1145,6 +1151,16 @@
if (candidate_size > _rpcmem_capacity)
_rpcmem_capacity = candidate_size;
QNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity);

if (0 != init_htp_perfinfra()) {
QNN_LOG_WARN("initialize HTP performance failure");
}
if (0 != set_rpc_polling()) {
QNN_LOG_WARN("set RPC polling failure");
}
if (0 != set_high_performance_mode()) {
QNN_LOG_WARN("set HTP high performance mode failure");
}
}

QNN_LOG_DEBUG("leave qni_init\n");
@@ -1156,9 +1172,8 @@
int ret_status = 0;
Qnn_ErrorHandle_t error = QNN_SUCCESS;

if (nullptr !=
__pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy
__pfn_rpc_mem_deinit();
if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy
_pfn_rpc_mem_deinit();

if (dlclose(_rpc_lib_handle) != 0) {
QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror());
@@ -1325,6 +1340,8 @@
if (error != QNN_SUCCESS) {
QNN_LOG_WARN("failed to get qnn device infra\n");
return 1;
} else {
QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n");
}

QnnHtpDevice_Infrastructure_t * htp_infra = static_cast<QnnHtpDevice_Infrastructure_t *>(device_infra);
@@ -1333,6 +1350,11 @@
uint32_t device_id = 0;
uint32_t core_id = 0;
htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid);
if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) {
QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType);
} else {
QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType);
}
_qnn_htp_perfinfra = htp_perfinfra;
_qnn_power_configid = power_configid;
@@ -1343,14 +1365,17 @@
if (_qnn_rpc_pollingtime > 0) {
QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime;
memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime));
rpc_pollingTime.option =
QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME;
rpc_pollingTime.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME;
rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime;
const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = {
&rpc_pollingTime, nullptr};

QnnHtpPerfInfrastructure_PowerConfig_t rpc_ControlLatency;
memset(&rpc_ControlLatency, 0, sizeof(rpc_ControlLatency));
rpc_ControlLatency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY;
rpc_ControlLatency.rpcControlLatencyConfig = 40;

const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = {&rpc_pollingTime, &rpc_ControlLatency, nullptr};
if (_qnn_htp_perfinfra) {
_qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid,
powerConfigs);
_qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs);
}
}
return 0;
@@ -1426,7 +1451,7 @@
}

auto allocate_bytes = static_cast<int32_t>(bytes + alignment);
void * buf = __pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS,
void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS,
allocate_bytes);
if (buf == nullptr) {
QNN_LOG_WARN("failed to allocate rpc memory\n");
@@ -1439,7 +1464,7 @@
_rpcmem_store_map.insert(std::pair<void *, void *>(aligned_buf, buf)).second;
if (!status) {
QNN_LOG_WARN("failed to allocate rpc memory\n");
__pfn_rpc_mem_free(buf);
_pfn_rpc_mem_free(buf);
}

return aligned_buf;
@@ -1451,7 +1476,7 @@
} else if (0 == _rpcmem_store_map.count(buf)) {
QNN_LOG_WARN("no allocated tensor\n");
} else {
__pfn_rpc_mem_free(_rpcmem_store_map[buf]);
_pfn_rpc_mem_free(_rpcmem_store_map[buf]);
_rpcmem_store_map.erase(buf);
}
}
@@ -1461,7 +1486,7 @@
if (!is_rpcmem_initialized()) {
QNN_LOG_WARN("rpc memory not initialized\n");
} else {
mem_fd = __pfn_rpc_mem_to_fd(buf);
mem_fd = _pfn_rpc_mem_to_fd(buf);
}

return mem_fd;
@@ -1560,7 +1585,7 @@
}

auto * get_providers =
reinterpret_cast<_pfn_qnnsysteminterface_getproviders *>(
reinterpret_cast<pfn_qnnsysteminterface_getproviders *>(
dlsym(_system_lib_handle, "QnnSystemInterface_getProviders"));
if (nullptr == get_providers) {
QNN_LOG_WARN(
@@ -1661,7 +1686,7 @@
return 1;
}

auto get_providers = load_qnn_functionpointers<_pfn_qnninterface_getproviders *>(
auto get_providers = load_qnn_functionpointers<pfn_qnninterface_getproviders *>(
lib_handle, "QnnInterface_getProviders");
if (nullptr == get_providers) {
QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s",
@@ -1805,11 +1830,11 @@
void * _rpc_lib_handle = nullptr;
std::atomic_bool _rpcmem_initialized{false};
_pfn_rpc_mem_alloc __pfn_rpc_mem_alloc;
_pfn_rpc_mem_free __pfn_rpc_mem_free;
_pfn_rpc_mem_to_fd __pfn_rpc_mem_to_fd;
_pfn_rpc_mem_init __pfn_rpc_mem_init;
_pfn_rpc_mem_deinit __pfn_rpc_mem_deinit;
pfn_rpc_mem_alloc _pfn_rpc_mem_alloc;
pfn_rpc_mem_free _pfn_rpc_mem_free;
pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd;
pfn_rpc_mem_init _pfn_rpc_mem_init;
pfn_rpc_mem_deinit _pfn_rpc_mem_deinit;
std::unordered_map<void *, void *> _rpcmem_store_map;
size_t _rpcmem_capacity = 512;
@@ -1824,101 +1849,63 @@ class qnn_instance {
static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
const struct ggml_tensor * tensor,
bool b_dump_tensor_info) {
// only support the following 3 OPs currently
// provide a GENERAL approach could fix this problem in a standalone PR of refine ggml backend
// subsystem for mixed inference between CPU&GPU / CPU&NPU easily for ANY ggml backends
// which the backend's ggml_backend_xxx_buffer_is_host return true.
// this approach could be found:
// https://github.com/ggerganov/llama.cpp/pull/7641
bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL)
|| (tensor->op == GGML_OP_MUL_MAT));
if (!supported_op) {
return false;
}

const struct ggml_tensor * src0 = tensor->src[0];
const struct ggml_tensor * src1 = tensor->src[1];
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne20 = tensor->ne[0];
const int64_t ne21 = tensor->ne[1];

//TODO: support other quantized data type
if (ggml_is_quantized(src0->type)) {
if ((src0->type != GGML_TYPE_Q8_0) && (src0->type != GGML_TYPE_Q4_0)) {
return false;
}
}

if (b_dump_tensor_info) {
if (tensor->op == GGML_OP_MUL_MAT) {
QNN_LOG_DEBUG("GGML_OP_MUL_MAT");
QNN_LOG_DEBUG("op name:%s, tensor type:%s",
ggml_op_name(tensor->op),
ggml_type_name(tensor->type));
QNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type));
QNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type));
QNN_LOG_DEBUG("src0 %15s: type = %i (%5s) ne = %5" PRIi64
" x %5" PRIi64 " x %5" PRIi64
", nb = (%5zi, %5zi, %5zi)\n",
src0->name, src0->type, ggml_type_name(src0->type),
src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0],
src0->nb[1], src0->nb[2]);
QNN_LOG_DEBUG("src1 %15s: type = %i (%5s) ne = %5" PRIi64
" x %5" PRIi64 " x %5" PRIi64
", nb = (%5zi, %5zi, %5zi)\n",
src1->name, src1->type, ggml_type_name(src1->type),
src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0],
src1->nb[1], src1->nb[2]);
QNN_LOG_DEBUG(
" %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
" x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
tensor->name, tensor->type, ggml_type_name(tensor->type),
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0],
tensor->nb[1], tensor->nb[2]);
}
}

if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE ||
tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW ||
tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) {
return false;
}

// make qnn_get_ggml_tensor_rank and QNN SDK happy
if ((ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1)) {
const struct ggml_tensor * src0 = tensor->src[0];
const struct ggml_tensor * src1 = tensor->src[1];
if (nullptr == src0 || nullptr == src1) {
return false;
}

int qtype = src0->type;
if (tensor->op == GGML_OP_ADD) {
return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 ||
qtype == GGML_TYPE_Q8_0) &&
(src1->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];

// make qnn_get_ggml_tensor_rank and QNN SDK happy
if (ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1) {
return false;
}

// TODO: support other GGML OPs using QNN API
// a GENERAL approach could fix this problem in a standalone PR of refine ggml backend
// subsystem for mixed inference between CPU&GPU / CPU&NPU easily for ANY ggml backends
// which the backend's ggml_backend_xxx_buffer_is_host return true.
// this approach could be found:
// https://github.com/ggerganov/llama.cpp/pull/7641
bool supported_op = false;
supported_op = (tensor->op == GGML_OP_ADD);
supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT));
if (!supported_op) {
return false;
}

//TODO: support other quantized data type
if (ggml_is_quantized(src0->type)) {
if (src0->type != GGML_TYPE_Q8_0 && src0->type != GGML_TYPE_Q4_0) {
return false;
}
}

int qtype = src0->type;
if (tensor->op == GGML_OP_MUL) {
return (qtype == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32);
}

if (tensor->op == GGML_OP_MUL_MAT) {
if (ctx->device == QNN_BACKEND_GGML) {
return (ne00 == ne10) && (src1->ne[2] % src0->ne[2] == 0) &&
(src1->ne[3] % src0->ne[3] == 0);
}
if ((ctx->device == QNN_BACKEND_NPU) && (qtype == GGML_TYPE_Q8_0) &&
(src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32)) {
if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) {
return false;
} else {
return true;
}
if (ctx->device == QNN_BACKEND_CPU || ctx->device == QNN_BACKEND_GPU) {
return (ne00 == ne10) && (ne00 == ne01);
}
return false;
}
}

return true;
}

static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
const ggml_tensor * src1, ggml_tensor * dst) {
|
|||
if (!graph_initialized) {
|
||||
graph_name = graph_name + "_" + std::to_string(ctx->threads) +
|
||||
src0->name + "_" + src1->name;
|
||||
QNN_LOG_DEBUG("graph name %s", graph_name.c_str());
|
||||
QNN_LOG_INFO("graph name %s", graph_name.c_str());
|
||||
if (ctx->device == QNN_BACKEND_NPU) {
|
||||
QnnHtpGraph_CustomConfig_t custom_config;
|
||||
custom_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
|
||||
custom_config.numHvxThreads = 8;
|
||||
|
||||
QnnGraph_Config_t graph_config;
|
||||
graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
|
||||
graph_config.customConfig = &custom_config;
|
||||
const QnnGraph_Config_t * p_graphconfig[] = {&graph_config, NULL};
|
||||
error = qnn_raw_interface.graphCreate(
|
||||
instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig,
|
||||
&graph_handle);
|
||||
} else {
|
||||
error = qnn_raw_interface.graphCreate(
|
||||
instance->get_qnn_context_handle(), graph_name.c_str(), nullptr,
|
||||
&graph_handle);
|
||||
}
|
||||
|
||||
if (QNN_SUCCESS != error) {
|
||||
QNN_LOG_INFO("can't create qnn graph handle with graph name %s, "
|
||||
"error = %d\n",
|
||||
|
@@ -2112,8 +2114,6 @@ failure:
dst->name, dst->type, ggml_type_name(dst->type),
dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0],
dst->nb[1], dst->nb[2]);
QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2],
src0->ne[3]);
}

QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions;
@@ -2198,7 +2198,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
if (!graph_initialized) {
graph_name = graph_name + "_" + std::to_string(ctx->threads) +
src0->name + "_" + src1->name;
QNN_LOG_DEBUG("graph name %s", graph_name.c_str());
QNN_LOG_INFO("graph name %s", graph_name.c_str());
error = qnn_raw_interface.graphCreate(
instance->get_qnn_context_handle(), graph_name.c_str(), nullptr,
&graph_handle);
@@ -2331,8 +2331,6 @@ failure:
" x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0],
dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]);
QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2],
src0->ne[3]);
}

QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions;
@@ -2894,7 +2892,6 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t
Qnn_ErrorHandle_t error = QNN_SUCCESS;
ggml_backend_qnn_buffer_context * ctx =
(ggml_backend_qnn_buffer_context *) buffer->context;

static int idx = 0;
char tensor_name[GGML_MAX_NAME] = {0};
snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++);
@@ -3061,7 +3058,7 @@ GGML_CALL static const char * ggml_backend_qnn_name(ggml_backend_t backend) {
GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) {
QNN_LOG_INFO("enter %s", __func__);
ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context;
QNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name);
QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name);

qnn_instance * instance = (qnn_instance *)g_qnn_mgr[ctx->device].instance;
if (instance != nullptr) {
@@ -3073,7 +3070,7 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) {
auto & graph_item = graph_it->second;
Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item);
GGML_UNUSED(graph_handle);
QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str());
QNN_LOG_INFO("graph type:%s", graph_it->first.c_str());
}
instance->_qnn_graph_map.clear();
@@ -3104,7 +3101,7 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe
params.type = GGML_TASK_TYPE_COMPUTE;
params.ith = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor *node = cgraph->nodes[i];
ggml_tensor * node = cgraph->nodes[i];
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE ||
node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW ||
node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
@@ -3213,7 +3210,6 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) {
device, GGML_QNN_MAX_DEVICES - 1);
return nullptr;
}

static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES];
static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES];
static bool ggml_backend_qnn_buffer_type_initialized = false;
@@ -12,6 +12,8 @@ ANDROID_PLATFORM=android-34
GGML_QNN_UT=ggml-qnn-ut
REMOTE_PATH=/data/local/tmp/
BUILDTYPE=Debug
BUILDTYPE=Release

function dump_vars()
@@ -70,7 +72,7 @@ function check_and_download_ndk()
function build_arm64
{
cmake -H. -B./out/arm64-v8a -DTARGET_NAME=${GGML_QNN_UT} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH}
cmake -H. -B./out/arm64-v8a -DTARGET_NAME=${GGML_QNN_UT} -DCMAKE_BUILD_TYPE=${BUILDTYPE} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH}

cd ./out/arm64-v8a
make
@@ -166,9 +168,9 @@ function show_usage()
echo "Usage:"
echo " $0 build (build Android command line UT program)"
echo " $0 updateqnnlibs (upload the latest QNN libs to Android phone)"
echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)"
echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)"
echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)"
echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
echo -e "\n\n\n"
}
@@ -72,14 +72,12 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const
int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line);
int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args);
if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) {
//for Android command line application or WoA
printf("%s\n", s_ggml_qnn_log_internal_buf);
}
va_end(args);
}
}

static const char * get_qnn_backend_name(int n_backend_type) {
switch (n_backend_type) {
case 0:
@@ -95,7 +93,6 @@ static const char * get_qnn_backend_name(int n_backend_type) {
}
}

static bool ggml_graph_compute_helper(
struct ggml_backend * backend,
struct ggml_cgraph * graph,
@@ -123,26 +120,25 @@ static bool ggml_graph_compute_helper(
}
#endif

//a new approch of mixed inference
if (nullptr != backend)
return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS;
else
return ggml_graph_compute(graph, &plan);
}

#define QK8_0 32

typedef struct {
uint16_t d; // delta
int8_t qs[QK8_0]; // quants
} block_q8_0;

static inline float ggml_compute_fp16_to_fp32(uint16_t h) {
__fp16 tmp;
memcpy(&tmp, &h, sizeof(uint16_t));
return (float)tmp;
}

#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)

static void tensor_dump(const ggml_tensor * tensor, const char * name) {
@@ -245,7 +241,6 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) {
}
}

static uint32_t get_tensor_rank(const ggml_tensor * tensor) {
uint32_t rank = 0;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
@@ -256,7 +251,6 @@ static uint32_t get_tensor_rank(const ggml_tensor * tensor) {
return rank;
}

static uint32_t get_tensor_data_size(const ggml_tensor * tensor) {
size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]);
size_t n_dims = get_tensor_rank(tensor);
@@ -270,7 +264,6 @@ static uint32_t get_tensor_data_size(const ggml_tensor * tensor) {
return ggml_nbytes(tensor);
}

//ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20
static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
// static RNG initialization (revisit if n_threads stops being constant)
@@ -305,8 +298,11 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
t.join();
}
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
//ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
#ifdef GGML_USE_QNN
memcpy((char*)tensor->data, data.data(), size * sizeof(float));
#else
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
#endif
} else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
@@ -321,18 +317,23 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
}
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
//ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
#ifdef GGML_USE_QNN
memcpy((char*)tensor->data, dataq.data(), dataq.size());
#else
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
#endif
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
// This is going to create some weird integers though.
//ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
#ifdef GGML_USE_QNN
memcpy((char*)tensor->data, data.data(), ggml_nbytes(tensor));
#else
ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
#endif
} else {
GGML_ASSERT(false);
}
}

//ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L310
static void initialize_tensors(ggml_context * ctx) {
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
@@ -340,19 +341,17 @@ static void initialize_tensors(ggml_context * ctx) {
}
}

static void show_usage() {
printf(" " \
"\nUsage: test_qnn_ops [options]\n" \
"\n" \
"Options:\n" \
" -t GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MULMAT\n" \
" -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU)\n" \
" -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" \
" ?/h print usage infomation\n\n"
);
}

static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) {
int64_t n_begin_time = 0LL;
int64_t n_end_time = 0LL;
@@ -370,15 +369,14 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) {
ggml_backend_buffer_t buffer= nullptr;

ggml_type qtype = GGML_TYPE_I8;
qtype = GGML_TYPE_F32;
qtype = GGML_TYPE_F16;
qtype = GGML_TYPE_Q8_0;
qtype = GGML_TYPE_F32;

std::vector<uint8_t> work_buffer;
QNN_LOG_DEBUG("enter qnn_ggml_op\n");
QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type));

n_begin_time = ggml_time_us();
srand(time(NULL));
@@ -473,7 +471,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) {
initialize_tensors(ctx);
}
ggml_set_f32(src1, (rand() % 100 + 1));
//ggml_set_f32(dst, 0.0f);
}

ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr);
@@ -501,13 +498,13 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) {
ggml_free(ctx);
ggml_backend_buffer_free(buffer);
ggml_backend_free(backend);

n_end_time = ggml_time_us();
n_duration = (n_end_time - n_begin_time) / 1000;
QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration);
return 0;
}

int main(int argc, char * argv[]) {
int num_threads = 4;
int n_backend_type = QNN_BACKEND_CPU;
@@ -531,7 +528,7 @@ int main(int argc, char * argv[]) {
} else if (0 == strcmp(argv[i], "-b")) {
if (i + 1 < argc) {
int backend = atoi(argv[i + 1]);
if (backend <= QNN_BACKEND_NPU)
if (backend <= QNN_BACKEND_GGML)
n_backend_type = backend;
else {
show_usage();
@@ -549,5 +546,6 @@ int main(int argc, char * argv[]) {
QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type));
qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type);

return 0;
}