add support for quantized data type Q8_0
parent 926a8661f3
commit dd29834c11
4 changed files with 379 additions and 229 deletions

ggml-qnn.cpp | 176
@@ -72,8 +72,6 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const
 #define RPCMEM_DEFAULT_FLAGS 1
 #define RPCMEM_HEAP_ID_SYSTEM 25

 #define GGML_DUMP_TENSOR(tensor) ggml_tensor_dump(tensor, #tensor)

 #define GGML_QNN_LOGBUF_LEN 4096

 #define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend
@@ -195,8 +193,17 @@ static ggml_backend_t g_qnn_backend = nullptr;

 static int g_current_device = QNN_BACKEND_GGML;

+//the QNN cDSP and HTA backends are not used for now; only the QNN CPU/GPU/NPU (aka HTP/DSP) backends are covered
+//according to the QNN SDK Reference Guide:
+//CPU - Choose a non-quantized model. Quantized models are currently incompatible with the CPU backend
+//GPU - Choose a non-quantized model. Quantized models are currently incompatible with the GPU backend
+//HTP - Choose a quantized model. Quantized models are required when running on the HTP backend
+//DSP - Choose a quantized model. Quantized models are required when running on the DSP backend
+//HTA - Choose a quantized model. Quantized models are required when running on the HTA backend
+//
+//this implementation of the QNN backend for ggml only covers the Qualcomm CPU/GPU/NPU backends:
+//Qualcomm CPU: Qualcomm Kryo CPU
+//Qualcomm GPU: Qualcomm Adreno GPU
+//Qualcomm NPU: aka HTP(Hexagon Tensor Processor), ~= cDSP(Compute DSP) + HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator)
 static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = {
     [QNN_BACKEND_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}},
     [QNN_BACKEND_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}},
@@ -849,6 +856,10 @@ static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) {
             return QNN_DATATYPE_FLOAT_16;
         case GGML_TYPE_F32:
             return QNN_DATATYPE_FLOAT_32;
+        case GGML_TYPE_I8:
+            return QNN_DATATYPE_INT_8;
+        case GGML_TYPE_Q8_0:
+            return QNN_DATATYPE_SFIXED_POINT_8;
         default:
             break;
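For context on why Q8_0 maps to an 8-bit signed fixed-point QNN type, here is a minimal sketch of the Q8_0 block layout, using the QK8_0/block_q8_0 definitions that appear later in this same commit; the row-size arithmetic is standard ggml layout, and the concrete ne0 value is illustrative only:

#include <cstdint>
#include <cstdio>

#define QK8_0 32
typedef struct {
    uint16_t d;         // per-block scale, stored as fp16 bits
    int8_t   qs[QK8_0]; // 32 signed 8-bit quants
} block_q8_0;

int main() {
    // each row of ne0 elements is stored as ne0/QK8_0 consecutive blocks
    const int ne0 = 64; // hypothetical row width; must be a multiple of QK8_0
    const int blocks_per_row = ne0 / QK8_0;
    printf("bytes per row: %zu\n", blocks_per_row * sizeof(block_q8_0)); // 2 blocks x 34 bytes
    return 0;
}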
@@ -903,14 +914,8 @@ static const char * get_qnn_backend_name(int n_backend_type) {
         case 2:
             return "QNN-NPU";
         case 3:
-            return "ggml"; //the default GGML backend, used to compare performance between the QNN backend and the default GGML backend
+            return "ggml"; //"fake" QNN backend, used to compare performance between the QNN backend and the original GGML
-
-#if 0 //the QNN cDSP and HTA backends are not used for now; focus on the QNN CPU/GPU/NPU (aka HTP/DSP) backends
-        case 3:
-            return "QNN-cDSP";
-        case 4:
-            return "QNN-HTA";
-#endif
         default:
             return "unknown";
     }
@@ -1720,7 +1725,7 @@ static void ggml_qnn_logcallback(const char * fmt,

     double ms = (double) timestamp / 1000000.0;

-    {
+    if (0) {
         std::lock_guard<std::mutex> lock(log_mutex);

         memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN);
@@ -1770,7 +1775,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
     _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle);
 #endif
     if (nullptr == _qnn_log_handle) {
-        QNN_LOG_WARN("why failed to initialize qnn log\n"); //the DSP backend does not work on Qualcomm SoC based low-end phones
+        QNN_LOG_WARN("why failed to initialize qnn log\n"); //the NPU backend does not work on Qualcomm SoC based low-end phones
         return 4;
     } else {
         QNN_LOG_DEBUG("initialize qnn log successfully\n");
@@ -2010,14 +2015,14 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum
     const struct ggml_tensor * src0 = tensor->src[0];
     const struct ggml_tensor * src1 = tensor->src[1];

-    const int64_t ne00 = tensor->src[0]->ne[0];
-    const int64_t ne01 = tensor->src[0]->ne[1];
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];

-    const int64_t ne10 = tensor->src[1]->ne[0];
-    const int64_t ne11 = tensor->src[1]->ne[1];
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];

     const int64_t ne0 = tensor->ne[0];
     const int64_t ne1 = tensor->ne[1];

     GGML_UNUSED(ne0);
     GGML_UNUSED(ne1);
@@ -2057,30 +2062,15 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum
         return false;
     }

     if (tensor->op == GGML_OP_ADD) {
         //TODO: this is a limitation
         return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
                 && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16)
                 && (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16);

         // GPU/NPU inference will be slower than CPU inference when tensor->ne[1] < min batch size
         if (tensor->ne[1] < 32) {
             return false;
         }
     }

     if (tensor->op == GGML_OP_MUL_MAT) {
-        //TODO: this is a limitation
-        return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
-                && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16)
-                && (src0->type == src1->type) && (src0->type == tensor->type);
+        int qtype = src0->type;
+        return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || qtype == GGML_TYPE_Q8_0)
+                && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);

         if (tensor->ne[1] < 32) { // GPU/NPU inference will be slower than CPU inference when tensor->ne[1] < min batch size
             return false;
         }
     }

     //TODO: this is a limitation
     return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
             && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16)
             && (src0->type == src1->type) && (src0->type == tensor->type);
 }
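A standalone restatement of the rule this hunk changes may help: a minimal sketch of what ggml_qnn_can_handle_op now accepts for GGML_OP_MUL_MAT (the function name and simplified signature below are mine, not the patch's; the predicate itself mirrors the added code):

#include "ggml.h" // assumes the ggml headers are on the include path

// mirrors the MUL_MAT acceptance rule added in this commit:
// src0 may now be Q8_0 in addition to F32/F16, while src1 stays float
static bool mul_mat_supported(const ggml_tensor * src0, const ggml_tensor * src1) {
    const enum ggml_type qtype = src0->type;
    return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || qtype == GGML_TYPE_Q8_0)
        && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
}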
@@ -2129,7 +2119,7 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;

     n_begin_time = ggml_time_us();
 #if 1
     QNN_LOG_DEBUG("call %s\n", __func__);
     QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
                   src0->name,
@@ -2147,17 +2137,23 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0));
     QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1));
     QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2));
 #endif

     QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;

     src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type);
     src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type);
     dst_qnn_type  = qnn_datatype_from_ggml_datatype(dst->type);

-    std::string map_entry = std::string(ggml_op_name(ggmlop));
+    uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
+                                     (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]};
+    uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
+                                     (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]};
+    uint32_t dimensions_output[]  = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
+                                     (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]};

+    std::string map_entry = std::string(ggml_op_name(ggmlop));
     if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) {
         graph_initialized = true;
         auto & graph_item = instance->_qnn_graph_map[map_entry];
@@ -2197,6 +2193,16 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
     QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};

+    QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
+    QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0);
+    QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type;
+    QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
+    QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1);
+    QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
+    QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
+    QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst);
+    QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;

     Qnn_Tensor_t tensor_inputs[] = {
         *tensor_0,
         *tensor_1
@@ -2245,6 +2251,11 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                                          (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]};
         uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
                                         (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]};

+        QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
+        QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
+        QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};

         QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
         QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0);
         QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type;
@@ -2255,10 +2266,6 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst);
         QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;

-        QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
-        QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
-        QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};

         Qnn_Tensor_t tensor_inputs[] = {
             *tensor_0,
             *tensor_1
@@ -2337,7 +2344,6 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1,
     QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;

     n_begin_time = ggml_time_us();
 #if 1
     QNN_LOG_DEBUG("call %s\n", __func__);
     QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
                   src0->name,
@@ -2355,17 +2361,23 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1,
     QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0));
     QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1));
     QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2));
 #endif

     QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;

     src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type);
     src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type);
     dst_qnn_type  = qnn_datatype_from_ggml_datatype(dst->type);

-    std::string map_entry = std::string(ggml_op_name(ggmlop));
+    uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
+                                     (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]};
+    uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
+                                     (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]};
+    uint32_t dimensions_output[]  = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
+                                     (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]};

+    std::string map_entry = std::string(ggml_op_name(ggmlop));
     if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) {
         graph_initialized = true;
         auto & graph_item = instance->_qnn_graph_map[map_entry];
@@ -2401,6 +2413,16 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1,
     QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
     QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};

+    QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
+    QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0);
+    QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type;
+    QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
+    QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1);
+    QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
+    QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
+    QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst);
+    QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;

     Qnn_Tensor_t tensor_inputs[] = {
         *tensor_0,
         *tensor_1
@@ -2543,7 +2565,7 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr
     }

     n_begin_time = ggml_time_us();
 #if 1
     QNN_LOG_DEBUG("call %s\n", __func__);
     QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
                   src0->name,
@@ -2561,11 +2583,17 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr
     QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0));
     QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1));
     QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2));
 #endif

     QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
+    uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
+                                     (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]};
+    uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
+                                     (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]};
+    uint32_t dimensions_output[]  = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
+                                     (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]};

     std::string map_entry = std::string(ggml_op_name(ggmlop));
     if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) {
@@ -2606,6 +2634,16 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr
     QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
     QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};

+    QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
+    QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0);
+    QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type;
+    QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
+    QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1);
+    QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
+    QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
+    QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst);
+    QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;

     Qnn_Tensor_t tensor_inputs[] = {
         *tensor_0,
         *tensor_1
@@ -3125,10 +3163,9 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t
     char tensor_name[GGML_MAX_NAME] = { 0 };
     snprintf(tensor_name, GGML_MAX_NAME, "tensor_%2d", idx++);

-    uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]};
-    //TODO: only support FP32 & FP16
-    Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32;
-    Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE;
+    uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]};
+    Qnn_DataType_t qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type);
+    Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE;

     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
         qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE;
@@ -3365,7 +3402,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const

 //note: this function is used with the proposed/refined ggml backend subsystem in this PR:
 //      https://github.com/ggerganov/llama.cpp/pull/7641
-//      a new ggml backend (only using system memory: ggml_backend_xxx_buffer_is_host returns true)
+//      any ggml backend (only using system memory: ggml_backend_xxx_buffer_is_host returns true)
 //      can follow this style for mixed inference between CPU&GPU / CPU&NPU very easily
 GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) {
     GGML_UNUSED(backend);
@@ -3481,7 +3518,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) {

 /**
  *
- * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU(aka HTP/DSP)
+ * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU
  * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer
  * @return
 */
@@ -3516,22 +3553,21 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) {
                       (path +
                        ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(),
                       1)) {
-            QNN_LOG_INFO("QNN DSP backend setenv successfully");
+            QNN_LOG_INFO("QNN NPU backend setenv successfully");
         } else {
-            QNN_LOG_ERROR("QNN DSP backend setenv failure");
+            QNN_LOG_ERROR("QNN NPU backend setenv failure");
         }
         if (0 == setenv("ADSP_LIBRARY_PATH",
                       (path +
                        ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(),
                       1)) {
-            QNN_LOG_INFO("QNN DSP backend setenv successfully");
+            QNN_LOG_INFO("QNN NPU backend setenv successfully");
         } else {
-            QNN_LOG_ERROR("QNN DSP backend setenv failure");
+            QNN_LOG_ERROR("QNN NPU backend setenv failure");
         }
     } else {
         if (0 == setenv("LD_LIBRARY_PATH",
-                      (path +
-                       ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(),
+                      path.c_str(),
                       1)) {
             QNN_LOG_INFO("%s backend setenv successfully\n", get_qnn_backend_name(device));
         } else {
@@ -10,19 +10,18 @@ extern "C" {

 #define GGML_QNN_MAX_DEVICES 3

-//the QNN cDSP and HTA backends are not used for now; only the QNN CPU/GPU/NPU (aka HTP/DSP) backends are covered
 enum QNNBackend {
     QNN_BACKEND_CPU,
     QNN_BACKEND_GPU,
     QNN_BACKEND_NPU,
-    QNN_BACKEND_GGML, //"fake" QNN backend just for compare performance between QNN and original GGML
+    QNN_BACKEND_GGML, //"fake" QNN backend, used to compare performance between QNN and the original GGML
 };

 GGML_API int ggml_backend_qnn_reg_devices(void);

 /**
  *
- * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU(aka HTP/DSP)
+ * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU
  * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer
  * @return
 */
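A minimal usage sketch of the init API documented above, under the assumptions that the public header is included as shown (the include name is assumed, not stated in this diff) and that the QNN libraries sit at the Android example path from the doc comment:

#include "ggml-qnn.h" // assumed header name for this backend's public API

int init_npu_backend(void) {
    ggml_backend_t backend = ggml_backend_qnn_init(QNN_BACKEND_NPU, "/data/local/tmp/");
    if (backend == nullptr) {
        return -1; // QNN library not found or device unsupported
    }
    // ... build and compute a ggml graph here, then release the backend:
    ggml_backend_free(backend);
    return 0;
}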
@@ -4,7 +4,8 @@ set -e

 #https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct
 #https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools
-QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/
+#QNN SDK released on 20240531
+QNN_SDK_PATH=/opt/qcom/aistack/qairt/2.23.0.240531/

 ANDROID_NDK=`pwd`/android-ndk-r26c
 ANDROID_PLATFORM=android-34
@@ -89,6 +90,23 @@ function remove_temp_dir()
 }


+function update_qnn_libs()
+{
+    check_qnn_sdk
+
+    adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/
+    adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/
+    adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/
+
+    #the QNN NPU(aka HTP/DSP) backend has only been verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3)
+    adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/
+    adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/
+    adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/
+    adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/
+    adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/
+}
+
+
 function check_qnn_libs()
 {
     #reuse the cached qnn libs in Android phone
@@ -96,16 +114,7 @@ function check_qnn_libs()
     if [ $? -eq 0 ]; then
         printf "QNN libs already exist on Android phone\n"
     else
-        adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/
-        adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/
-        adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/
-
-        #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully
-        adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/
-        adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/
-        adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/
-        adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/
-        adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/
+        update_qnn_libs
     fi
 }
@@ -155,7 +164,8 @@ function run_ggml_qnn_ut()
 function show_usage()
 {
     echo "Usage:"
-    echo "  $0 build"
+    echo "  $0 build          (build Android command line UT program)"
+    echo "  $0 updateqnnlibs  (upload the latest QNN libs to Android phone)"
     echo "  $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)"
     echo "  $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)"
     echo "  $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)"
@@ -183,6 +193,9 @@ elif [ $# == 1 ]; then
 elif [ "$1" == "build" ]; then
     build_ggml_qnn_ut
     exit 0
+elif [ "$1" == "updateqnnlibs" ]; then
+    update_qnn_libs
+    exit 0
 else
     ggmlop=$1
     qnnbackend=0
@@ -87,7 +87,7 @@ static const char * get_qnn_backend_name(int n_backend_type) {
         case 1:
             return "QNN-GPU";
         case 2:
-            return "QNN-NPU(HTP/DSP)";
+            return "QNN-NPU";
         case 3:
             return "ggml";
         default:
@@ -131,9 +131,54 @@ static bool ggml_graph_compute_helper(
 }


-static void tensor_dump_elements(const ggml_tensor * tensor) {
+#define QK8_0 32
+typedef struct {
+    uint16_t d;       // delta
+    int8_t qs[QK8_0]; // quants
+} block_q8_0;
+
+
+static inline float ggml_compute_fp16_to_fp32(uint16_t h) {
+    __fp16 tmp;
+    memcpy(&tmp, &h, sizeof(uint16_t));
+    return (float)tmp;
+}
+#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+
+static void tensor_dump(const ggml_tensor * tensor, const char * name) {
+    QNN_LOG_DEBUG("dump ggml tensor %s(%s): type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                  name, tensor->name,
+                  tensor->type, ggml_type_name(tensor->type),
+                  tensor->ne[0], tensor->ne[1], tensor->ne[2],
+                  tensor->nb[0], tensor->nb[1], tensor->nb[2]);
+
+    float value = 0;
+    std::ostringstream tmposs;
+    if (nullptr == tensor) {
+        QNN_LOG_WARN("tensor is null");
+        return;
+    }
+    if (tensor->type == GGML_TYPE_I8) {
+        for (int h = 0; h < tensor->ne[3]; h++) {
+            for (int i = 0; i < tensor->ne[2]; i++) {
+                for (int j = 0; j < tensor->ne[1]; j++) {
+                    for (int k = 0; k < tensor->ne[0]; k++) {
+                        value = ((int8_t *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] +
+                                                          j * tensor->ne[0] + k];
+                        tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value
+                               << " ";
+                    }
+                    tmposs << "\n";
+                }
+            }
+        }
+        if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) {
+            QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str());
+            tmposs.clear();
+            tmposs.str("");
+        }
+    }

     if (tensor->type == GGML_TYPE_F32) {
         for (int h = 0; h < tensor->ne[3]; h++) {
             for (int i = 0; i < tensor->ne[2]; i++) {
@@ -144,31 +189,59 @@ static void tensor_dump_elements(const ggml_tensor * tensor) {
                         tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value
                                << " ";
                     }
-                    if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) {
-                        QNN_LOG_DEBUG("%s", tmposs.str().c_str());
-                    }
-                    tmposs.clear();
-                    tmposs.str("");
-                    //QNN_LOG_DEBUG("\n");
+                    tmposs << "\n";
                 }
             }
         }
+        if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) {
+            QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str());
+            tmposs.clear();
+            tmposs.str("");
+        }
     }

-    //QNN_LOG_DEBUG("\n");
-}
+    if (tensor->type == GGML_TYPE_F16) {
+        for (int h = 0; h < tensor->ne[3]; h++) {
+            for (int i = 0; i < tensor->ne[2]; i++) {
+                for (int j = 0; j < tensor->ne[1]; j++) {
+                    for (int k = 0; k < tensor->ne[0]; k++) {
+                        unsigned short tmpvalue = ((unsigned short *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] +
+                                                                                    j * tensor->ne[0] + k];
+                        value = GGML_FP16_TO_FP32(tmpvalue);
+                        tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value
+                               << " ";
+                    }
+                    tmposs << "\n";
+                }
+            }
+        }
+        if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) {
+            QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str());
+            tmposs.clear();
+            tmposs.str("");
+        }
+    }

-static void tensor_dump(const ggml_tensor * tensor, const char * name) {
-    QNN_LOG_DEBUG("dump ggml tensor %s(%s)", name, tensor->name);
-    QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)",
-                  name,
-                  tensor->type, ggml_type_name(tensor->type),
-                  tensor->ne[0], tensor->ne[1], tensor->ne[2],
-                  tensor->nb[0], tensor->nb[1], tensor->nb[2]);
-    tensor_dump_elements(tensor);
-
-    QNN_LOG_DEBUG("\n");
+    if (tensor->type == GGML_TYPE_Q8_0) {
+        block_q8_0 * tmp = ((block_q8_0 *)tensor->data);
+        for (int j = 0; j < tensor->ne[1]; j++) {
+            int n = tensor->ne[0] / QK8_0; //blocks per row
+            for (int z = 0; z < n; z++) {
+                const float d = GGML_FP16_TO_FP32(tmp[ j * n + z ].d);
+                for (int k = 0; k < QK8_0; k++) {
+                    value = tmp[j * n + z].qs[k] * d;
+                    tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value
+                           << " ";
+                }
+            }
+            tmposs << "\n";
+        }
+        if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) {
+            QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str());
+            tmposs.clear();
+            tmposs.str("");
+        }
+    }
 }
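The Q8_0 dump loop above is also the dequantization rule in miniature: value = fp16(d) * qs[k]. A self-contained sketch of the same computation in portable C++, using a software fp16 decode instead of the __fp16 cast from the test code so it also runs off-device (the concrete d/qs values are illustrative):

#include <cstdint>
#include <cstring>
#include <cstdio>
#include <cmath>

// portable half -> float decode; a software stand-in for the __fp16 cast above
static float fp16_to_fp32(uint16_t h) {
    const uint32_t sign = (uint32_t)(h >> 15) << 31;
    const uint32_t exp  = (h >> 10) & 0x1F;
    const uint32_t mant = h & 0x3FF;
    float out;
    if (exp == 0) {                 // subnormal or zero: mant * 2^-24
        out = std::ldexp((float) mant, -24);
    } else if (exp == 31) {         // inf / nan
        out = mant ? NAN : INFINITY;
    } else {                        // normal: (mant | 0x400) * 2^(exp - 25)
        out = std::ldexp((float)(mant | 0x400), (int) exp - 25);
    }
    uint32_t bits;
    std::memcpy(&bits, &out, 4);
    bits |= sign;
    std::memcpy(&out, &bits, 4);
    return out;
}

int main() {
    // one Q8_0 block: scale d = 0.5 (0x3800 in fp16), quants -16..15
    const uint16_t d_bits = 0x3800;
    int8_t qs[32];
    for (int k = 0; k < 32; k++) qs[k] = (int8_t)(k - 16);

    const float d = fp16_to_fp32(d_bits);
    for (int k = 0; k < 32; k++) {
        printf("%6.2f ", qs[k] * d); // dequantized value, exactly as in tensor_dump
    }
    printf("\n");
    return 0;
}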
@@ -231,7 +304,8 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
         t.join();
     }
     if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
-        ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
+        //ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
+        memcpy((char*)tensor->data, data.data(), size * sizeof(float));
     } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
         GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
         std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
@@ -246,10 +320,12 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
         }
         ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
         GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
-        ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
+        //ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
+        memcpy((char*)tensor->data, dataq.data(), dataq.size());
     } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
         // This is going to create some weird integers though.
-        ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
+        //ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
+        memcpy((char*)tensor->data, data.data(), ggml_nbytes(tensor));
     } else {
         GGML_ASSERT(false);
     }
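A note on the switch from ggml_backend_tensor_set to a raw memcpy in the hunk above: writing tensor->data directly only works when the tensor lives in host memory. A hedged sketch of the guard one might add (the helper name is mine; ggml_backend_buffer_is_host and ggml_backend_tensor_set are the standard ggml-backend APIs):

#include "ggml.h"
#include "ggml-backend.h"
#include <cstring>

// write src into t->data directly only when the buffer is host-visible;
// otherwise fall back to the backend copy path
static void tensor_write(struct ggml_tensor * t, const void * src, size_t nbytes) {
    if (t->buffer == nullptr || ggml_backend_buffer_is_host(t->buffer)) {
        memcpy(t->data, src, nbytes);
    } else {
        ggml_backend_tensor_set(t, src, 0, nbytes);
    }
}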
@@ -276,16 +352,13 @@ static void show_usage() {
 }


-int main(int argc, char * argv[]) {
+static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) {
     int64_t n_begin_time = 0LL;
     int64_t n_end_time = 0LL;
     int64_t n_duration = 0LL;
     size_t ctx_size = 0;
     int sizey = 4;
     int sizex = 4;
-    int num_threads = 4;
-    int n_backend_type = QNN_BACKEND_CPU;
-    int n_ggml_op_type = GGML_OP_ADD;

     struct ggml_context * ctx = nullptr;
     struct ggml_cgraph * gf = nullptr;
@@ -294,8 +367,150 @@ int main(int argc, char * argv[]) {
     struct ggml_tensor * dst = nullptr;
     ggml_backend_t backend = nullptr;
     ggml_backend_buffer_t buffer = nullptr;

-    ggml_type qtype = GGML_TYPE_F32;
+    ggml_type qtype = GGML_TYPE_I8;
+    qtype = GGML_TYPE_F32;
+    qtype = GGML_TYPE_F16;
+    qtype = GGML_TYPE_Q8_0;

     std::vector<uint8_t> work_buffer;
     QNN_LOG_DEBUG("enter qnn_ggml_op\n");
     QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type));

     n_begin_time = ggml_time_us();
     srand(time(NULL));

     ctx_size += 1024 * 1024 * 32;
     QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size,
                   (ctx_size / 1024 / 1024));

     struct ggml_init_params params = {
         /*.mem_size   =*/ ctx_size,
         /*.mem_buffer =*/ NULL,
         /*.no_alloc   =*/ 0
     };

     if (n_backend_type != QNN_BACKEND_GGML) {
         params.no_alloc = true;
         backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/");
         if (nullptr == backend) {
             QNN_LOG_ERROR("create qnn backend %d(%s) failed\n", n_backend_type, get_qnn_backend_name(n_backend_type));
             return 1;
         }
     }

     ctx = ggml_init(params);
     if (!ctx) {
         QNN_LOG_ERROR("%s: ggml_init() failed\n", __func__);
         return 2;
     }

     QNN_LOG_DEBUG("creating new tensors\n");
     QNN_LOG_DEBUG("ggml_blck_size(%s) %d\n", ggml_type_name(qtype), ggml_blck_size(qtype));
     QNN_LOG_DEBUG("ggml_type_size(%s) %d\n", ggml_type_name(qtype), ggml_type_size(qtype));
     if (ggml_is_quantized(qtype)) {
         sizex = ggml_blck_size(qtype);

         if (n_ggml_op_type == GGML_OP_MUL_MAT) {
             sizex = ggml_blck_size(qtype) * 2;
         }
     }
     QNN_LOG_DEBUG("sizex %d\n", sizex);

     if (n_ggml_op_type == GGML_OP_MUL) {
         src0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
         src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
     } else {
         src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
         src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
     }
     ggml_set_input(src0);
     ggml_set_input(src1);

     switch (n_ggml_op_type) {
         case GGML_OP_ADD:
             dst = ggml_add(ctx, src0, src1);
             break;
         case GGML_OP_MUL:
             dst = ggml_mul(ctx, src0, src1);
             break;
         case GGML_OP_MUL_MAT:
             dst = ggml_mul_mat(ctx, src0, src1);
             break;
         default:
             QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type,
                          ggml_op_name((enum ggml_op) n_ggml_op_type));
             ggml_free(ctx);
             ggml_backend_free(backend);
             return 3;
     }

     ggml_set_output(dst);
 #ifdef GGML_USE_QNN
     if (n_backend_type != QNN_BACKEND_GGML) {
         buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
         if (!buffer) {
             QNN_LOG_ERROR("%s: failed to allocate backend buffer\n", __func__);
             ggml_free(ctx);
             ggml_backend_free(backend);
             return 4;
         }
     }
 #endif

     QNN_LOG_DEBUG("creating compute graph\n");
     gf = ggml_new_graph(ctx);
     ggml_build_forward_expand(gf, dst);

     if (n_backend_type != QNN_BACKEND_GGML) {
         initialize_tensors(ctx);
     } else {
         if (qtype == GGML_TYPE_F32) {
             ggml_set_f32(src0, (rand() % 100 + 1));
         } else {
             initialize_tensors(ctx);
         }
         ggml_set_f32(src1, (rand() % 100 + 1));
         //ggml_set_f32(dst, 0.0f);
     }

     ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr);

     if (get_tensor_data_size(dst) < (32 * 32)) {
         QNN_LOG_DEBUG("dump tensors:\n");
         TENSOR_DUMP(src0);
         TENSOR_DUMP(src1);
         TENSOR_DUMP(dst);
     } else {
         QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
                       src0->name,
                       src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
                       src0->nb[0], src0->nb[1], src0->nb[2]);
         QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
                       src1->name,
                       src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2],
                       src1->nb[0], src1->nb[1], src1->nb[2]);
         QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
                       dst->name,
                       dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0],
                       dst->nb[1], dst->nb[2]);
     }

     ggml_free(ctx);
     ggml_backend_buffer_free(buffer);
     ggml_backend_free(backend);
     n_end_time = ggml_time_us();
     n_duration = (n_end_time - n_begin_time) / 1000;
     QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration);
     return 0;
 }


 int main(int argc, char * argv[]) {
     int num_threads = 4;
     int n_backend_type = QNN_BACKEND_CPU;
     int n_ggml_op_type = GGML_OP_ADD;

     for (int i = 1; i < argc; i++) {
         if (0 == strcmp(argv[i], "-t")) {
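The sizing logic in the qnn_op_ut hunk above matters because quantized tensors are stored whole blocks at a time, so a quantized src0 row must be a multiple of the block size. A small sketch that mirrors that rule (the helper name and the printed numbers are illustrative, not from the patch):

#include <cstdio>

// mirrors the sizing logic in qnn_op_ut: a quantized src0 row must hold
// whole blocks, so sizex is forced to a multiple of the block size
static int pick_sizex(bool is_quantized, int blck_size, bool is_mul_mat) {
    int sizex = 4;                 // default used by the UT
    if (is_quantized) {
        sizex = blck_size;         // e.g. 32 for Q8_0
        if (is_mul_mat) {
            sizex = blck_size * 2; // two blocks per row for MUL_MAT
        }
    }
    return sizex;
}

int main() {
    printf("Q8_0 MUL_MAT sizex = %d\n", pick_sizex(true, 32, true)); // prints 64
    return 0;
}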
@@ -330,121 +545,8 @@ int main(int argc, char * argv[]) {
     }

-    QNN_LOG_DEBUG("enter qnn_ggml_op\n");
-    QNN_LOG_DEBUG("ggml op:%d(%s)", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type));
-
-    n_begin_time = ggml_time_us();
-    srand(time(NULL));
-
-    ctx_size += 1024 * 1024 * 32;
-    QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size,
-                  (ctx_size / 1024 / 1024));
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ 0
-    };
-
-    if (n_backend_type != QNN_BACKEND_GGML) {
-        params.no_alloc = true;
-        backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/");
-        if (nullptr == backend) {
-            QNN_LOG_ERROR("create qnn backend %d(%s) failed", n_backend_type, get_qnn_backend_name(n_backend_type));
-            return 1;
-        }
-    }
-
-    ctx = ggml_init(params);
-    if (!ctx) {
-        QNN_LOG_ERROR("%s: ggml_init() failed\n");
-        return 2;
-    }
-
-    QNN_LOG_DEBUG("creating new tensors\n");
-    QNN_LOG_DEBUG("ggml_blck_size(%s) %d", ggml_type_name(qtype), ggml_blck_size(qtype));
-    QNN_LOG_DEBUG("ggml_type_size(%s) %d", ggml_type_name(qtype), ggml_type_size(qtype));
-    if (qtype != GGML_TYPE_F32) {
-        sizex = ggml_blck_size(qtype);
-    }
-
-    src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_set_input(src0);
-    src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-    ggml_set_input(src1);
-
-    switch (n_ggml_op_type) {
-        case GGML_OP_ADD:
-            dst = ggml_add(ctx, src0, src1);
-            break;
-        case GGML_OP_MUL:
-            dst = ggml_mul(ctx, src0, src1);
-            break;
-        case GGML_OP_MUL_MAT:
-            dst = ggml_mul_mat(ctx, src0, src1);
-            break;
-        default:
-            QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type,
-                         ggml_op_name((enum ggml_op) n_ggml_op_type));
-            ggml_free(ctx);
-            ggml_backend_free(backend);
-            return 3;
-    }
-
-    ggml_set_output(dst);
-#ifdef GGML_USE_QNN
-    if (n_backend_type != QNN_BACKEND_GGML) {
-        buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
-        if (!buffer) {
-            QNN_LOG_ERROR("%s: failed to allocate backend buffer\n", __func__);
-            ggml_free(ctx);
-            ggml_backend_free(backend);
-            return 4;
-        }
-    }
-#endif
-
-    QNN_LOG_DEBUG("creating compute graph\n");
-    gf = ggml_new_graph(ctx);
-    ggml_build_forward_expand(gf, dst);
-
-#if 0
-    ggml_set_f32(src0, (rand() % 100 + 1));
-    ggml_set_f32(src1, (rand() % 100 + 1));
-    ggml_set_f32(dst, 0.0f);
-#else
-    if (n_backend_type != QNN_BACKEND_GGML) {
-        initialize_tensors(ctx);
-    }
-#endif
-
-    ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr);
-    if (get_tensor_data_size(dst) < (32 * 32)) {
-        QNN_LOG_DEBUG("dump tensors:\n");
-        TENSOR_DUMP(src0);
-        TENSOR_DUMP(src1);
-        TENSOR_DUMP(dst);
-    } else {
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      src0->name,
-                      src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
-                      src0->nb[0], src0->nb[1], src0->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      src1->name,
-                      src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2],
-                      src1->nb[0], src1->nb[1], src1->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      dst->name,
-                      dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0],
-                      dst->nb[1], dst->nb[2]);
-    }
-
-    ggml_free(ctx);
-    ggml_backend_buffer_free(buffer);
-    ggml_backend_free(backend);
-
-    n_end_time = ggml_time_us();
-    n_duration = (n_end_time - n_begin_time) / 1000;
-    QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration);
+    QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type));
+    qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type);

     return 0;
 }