use ggml_qnn_tensor_reader for output tensor
This commit is contained in:
parent
36e41a1055
commit
a5679ddd8e
1 changed file with 12 additions and 116 deletions
128
ggml-qnn.cpp
128
ggml-qnn.cpp
|
@ -2091,15 +2091,12 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
|
||||||
std::string graph_name = "ggml_op_qnn_add";
|
std::string graph_name = "ggml_op_qnn_add";
|
||||||
Qnn_GraphHandle_t graph_handle = nullptr;
|
Qnn_GraphHandle_t graph_handle = nullptr;
|
||||||
Qnn_Tensor_t * tensor_1 = nullptr;
|
Qnn_Tensor_t * tensor_1 = nullptr;
|
||||||
Qnn_Tensor_t * tensor_2 = nullptr;
|
|
||||||
Qnn_Param_t qnn_params[] = {};
|
Qnn_Param_t qnn_params[] = {};
|
||||||
enum ggml_op ggmlop = GGML_OP_ADD;
|
enum ggml_op ggmlop = GGML_OP_ADD;
|
||||||
Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
|
Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
|
||||||
Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32;
|
|
||||||
|
|
||||||
CHECK_PARAMS(ctx, src0, src1, dst);
|
CHECK_PARAMS(ctx, src0, src1, dst);
|
||||||
tensor_1 = (Qnn_Tensor_t *) src1->extra;
|
tensor_1 = (Qnn_Tensor_t *) src1->extra;
|
||||||
tensor_2 = (Qnn_Tensor_t *) dst->extra;
|
|
||||||
instance = ctx->instance;
|
instance = ctx->instance;
|
||||||
QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
|
QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
|
||||||
|
|
||||||
|
@ -2107,17 +2104,12 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
|
||||||
perf.start();
|
perf.start();
|
||||||
|
|
||||||
QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
|
QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
|
||||||
QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
|
|
||||||
|
|
||||||
src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type);
|
src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type);
|
||||||
dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type);
|
|
||||||
|
|
||||||
uint32_t dimensions_input_1[] = {
|
uint32_t dimensions_input_1[] = {
|
||||||
(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2],
|
(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2],
|
||||||
(uint32_t) src1->ne[3]};
|
(uint32_t) src1->ne[3]};
|
||||||
uint32_t dimensions_output[] = {
|
|
||||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
|
|
||||||
(uint32_t) dst->ne[3]};
|
|
||||||
|
|
||||||
std::string map_entry = std::string(ggml_op_name(ggmlop));
|
std::string map_entry = std::string(ggml_op_name(ggmlop));
|
||||||
if (instance->_qnn_graph_map.find(map_entry) !=
|
if (instance->_qnn_graph_map.find(map_entry) !=
|
||||||
|
@ -2128,7 +2120,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions;
|
uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions;
|
||||||
uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions;
|
|
||||||
|
|
||||||
if (!graph_initialized) {
|
if (!graph_initialized) {
|
||||||
graph_name = graph_name + "_" + std::to_string(ctx->threads) +
|
graph_name = graph_name + "_" + std::to_string(ctx->threads) +
|
||||||
|
@ -2190,9 +2181,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
|
||||||
if (ctx->device == QNN_BACKEND_NPU) {
|
if (ctx->device == QNN_BACKEND_NPU) {
|
||||||
QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
|
QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
|
||||||
QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0};
|
QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0};
|
||||||
|
|
||||||
QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
|
|
||||||
QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx);
|
ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx);
|
||||||
|
@ -2204,27 +2192,20 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
|
||||||
QNN_LOG_INFO("error = %d\n", error);
|
QNN_LOG_INFO("error = %d\n", error);
|
||||||
goto failure;
|
goto failure;
|
||||||
}
|
}
|
||||||
error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2);
|
ggml_qnn_tensor_reader tensor_reader(dst, graph_handle, ctx);
|
||||||
if (QNN_SUCCESS != error) {
|
if (!tensor_writer0.is_valid()) {
|
||||||
QNN_LOG_INFO("error = %d\n", error);
|
|
||||||
goto failure;
|
goto failure;
|
||||||
}
|
}
|
||||||
|
|
||||||
QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
|
QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
|
||||||
QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1);
|
QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1);
|
||||||
QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
|
QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
|
||||||
QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
|
|
||||||
QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst);
|
|
||||||
QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
|
|
||||||
|
|
||||||
if (ctx->device != QNN_BACKEND_NPU) {
|
if (ctx->device != QNN_BACKEND_NPU) {
|
||||||
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
|
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
|
||||||
qnn_get_ggml_tensor_data_size(src1)};
|
qnn_get_ggml_tensor_data_size(src1)};
|
||||||
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
|
|
||||||
qnn_get_ggml_tensor_data_size(dst)};
|
|
||||||
} else {
|
} else {
|
||||||
uint8_t * qnn_buffer_1 = nullptr;
|
uint8_t * qnn_buffer_1 = nullptr;
|
||||||
uint8_t * qnn_buffer_2 = nullptr;
|
|
||||||
qnn_instance * instance = ctx->instance;
|
qnn_instance * instance = ctx->instance;
|
||||||
|
|
||||||
qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(
|
qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(
|
||||||
|
@ -2237,20 +2218,10 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
|
||||||
}
|
}
|
||||||
instance->register_rpcmem(qnn_buffer_1, tensor_1);
|
instance->register_rpcmem(qnn_buffer_1, tensor_1);
|
||||||
memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
|
memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
|
||||||
|
|
||||||
qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(
|
|
||||||
ggml_nbytes(dst), 4));
|
|
||||||
if (nullptr == qnn_buffer_2) {
|
|
||||||
QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
|
|
||||||
goto failure;
|
|
||||||
} else {
|
|
||||||
QNN_LOG_INFO("alloc rpcmem successfully\n");
|
|
||||||
}
|
|
||||||
instance->register_rpcmem(qnn_buffer_2, tensor_2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1};
|
Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1};
|
||||||
Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
|
Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()};
|
||||||
Qnn_OpConfig_t op_config = {
|
Qnn_OpConfig_t op_config = {
|
||||||
(Qnn_OpConfigVersion_t) 1,
|
(Qnn_OpConfigVersion_t) 1,
|
||||||
.v1 = {"ggml_op_add",
|
.v1 = {"ggml_op_add",
|
||||||
|
@ -2285,38 +2256,25 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
|
||||||
goto failure;
|
goto failure;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx->device == QNN_BACKEND_NPU) {
|
auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_reader.get_qnn_tensor());
|
||||||
uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
|
|
||||||
QNN_VER_PTR(*tensor_2)->memHandle));
|
|
||||||
memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
|
|
||||||
}
|
|
||||||
auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_2);
|
|
||||||
instance->_qnn_graph_map[map_entry] = graph_item;
|
instance->_qnn_graph_map[map_entry] = graph_item;
|
||||||
} else {
|
} else {
|
||||||
auto & graph_item = instance->_qnn_graph_map[map_entry];
|
auto & graph_item = instance->_qnn_graph_map[map_entry];
|
||||||
ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx);
|
ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx);
|
||||||
tensor_1 = std::get<2>(graph_item);
|
tensor_1 = std::get<2>(graph_item);
|
||||||
tensor_2 = std::get<3>(graph_item);
|
ggml_qnn_tensor_reader tensor_reader(dst, std::get<3>(graph_item), ctx);
|
||||||
|
|
||||||
uint32_t dimensions_input_1[] = {
|
uint32_t dimensions_input_1[] = {
|
||||||
(uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
|
(uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
|
||||||
(uint32_t) src1->ne[2], (uint32_t) src1->ne[3]};
|
(uint32_t) src1->ne[2], (uint32_t) src1->ne[3]};
|
||||||
uint32_t dimensions_output[] = {
|
|
||||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
|
|
||||||
(uint32_t) dst->ne[3]};
|
|
||||||
|
|
||||||
QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
|
QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
|
||||||
QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1);
|
QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1);
|
||||||
QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
|
QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
|
||||||
QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
|
|
||||||
QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst);
|
|
||||||
QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
|
|
||||||
|
|
||||||
if (ctx->device != QNN_BACKEND_NPU) {
|
if (ctx->device != QNN_BACKEND_NPU) {
|
||||||
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
|
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
|
||||||
qnn_get_ggml_tensor_data_size(src1)};
|
qnn_get_ggml_tensor_data_size(src1)};
|
||||||
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
|
|
||||||
qnn_get_ggml_tensor_data_size(dst)};
|
|
||||||
} else {
|
} else {
|
||||||
uint8_t * qnn_buffer_1 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
|
uint8_t * qnn_buffer_1 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
|
||||||
QNN_VER_PTR(*tensor_1)->memHandle));
|
QNN_VER_PTR(*tensor_1)->memHandle));
|
||||||
|
@ -2325,7 +2283,7 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
|
||||||
}
|
}
|
||||||
|
|
||||||
Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1};
|
Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1};
|
||||||
Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
|
Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()};
|
||||||
error = qnn_raw_interface.graphExecute(graph_handle,
|
error = qnn_raw_interface.graphExecute(graph_handle,
|
||||||
tensor_inputs,2,
|
tensor_inputs,2,
|
||||||
tensor_outputs,1,
|
tensor_outputs,1,
|
||||||
|
@ -2339,19 +2297,11 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
|
||||||
QNN_LOG_INFO("error = %d\n", error);
|
QNN_LOG_INFO("error = %d\n", error);
|
||||||
goto failure;
|
goto failure;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx->device == QNN_BACKEND_NPU) {
|
|
||||||
uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
|
|
||||||
QNN_VER_PTR(*tensor_2)->memHandle));
|
|
||||||
if (nullptr != qnn_buffer_2)
|
|
||||||
memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
failure:
|
failure:
|
||||||
if (QNN_SUCCESS != error) {
|
if (QNN_SUCCESS != error) {
|
||||||
QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1));
|
QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1));
|
||||||
QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2));
|
|
||||||
QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
|
QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
|
||||||
" x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
|
" x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
|
||||||
src0->name, src0->type, ggml_type_name(src0->type),
|
src0->name, src0->type, ggml_type_name(src0->type),
|
||||||
|
@ -2370,7 +2320,6 @@ failure:
|
||||||
}
|
}
|
||||||
|
|
||||||
QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions;
|
QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions;
|
||||||
QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions;
|
|
||||||
|
|
||||||
perf.info();
|
perf.info();
|
||||||
}
|
}
|
||||||
|
@ -2395,15 +2344,12 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
|
||||||
std::string graph_name = "ggml_op_qnn_mul_mat";
|
std::string graph_name = "ggml_op_qnn_mul_mat";
|
||||||
Qnn_GraphHandle_t graph_handle = nullptr;
|
Qnn_GraphHandle_t graph_handle = nullptr;
|
||||||
Qnn_Tensor_t * tensor_1 = nullptr;
|
Qnn_Tensor_t * tensor_1 = nullptr;
|
||||||
Qnn_Tensor_t * tensor_2 = nullptr;
|
|
||||||
Qnn_Param_t qnn_params[] = {};
|
Qnn_Param_t qnn_params[] = {};
|
||||||
enum ggml_op ggmlop = GGML_OP_MUL_MAT;
|
enum ggml_op ggmlop = GGML_OP_MUL_MAT;
|
||||||
Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
|
Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
|
||||||
Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32;
|
|
||||||
|
|
||||||
CHECK_PARAMS(ctx, src0, src1, dst);
|
CHECK_PARAMS(ctx, src0, src1, dst);
|
||||||
tensor_1 = (Qnn_Tensor_t *) src1->extra;
|
tensor_1 = (Qnn_Tensor_t *) src1->extra;
|
||||||
tensor_2 = (Qnn_Tensor_t *) dst->extra;
|
|
||||||
instance = ctx->instance;
|
instance = ctx->instance;
|
||||||
QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
|
QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
|
||||||
|
|
||||||
|
@ -2411,21 +2357,15 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
|
||||||
perf.start();
|
perf.start();
|
||||||
|
|
||||||
tensor_1 = (Qnn_Tensor_t *) src1->extra;
|
tensor_1 = (Qnn_Tensor_t *) src1->extra;
|
||||||
tensor_2 = (Qnn_Tensor_t *) dst->extra;
|
|
||||||
instance = ctx->instance;
|
instance = ctx->instance;
|
||||||
|
|
||||||
QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
|
QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
|
||||||
QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
|
|
||||||
|
|
||||||
src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type);
|
src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type);
|
||||||
dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type);
|
|
||||||
|
|
||||||
uint32_t dimensions_input_1[] = {
|
uint32_t dimensions_input_1[] = {
|
||||||
(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2],
|
(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2],
|
||||||
(uint32_t) src1->ne[3]};
|
(uint32_t) src1->ne[3]};
|
||||||
uint32_t dimensions_output[] = {
|
|
||||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
|
|
||||||
(uint32_t) dst->ne[3]};
|
|
||||||
|
|
||||||
std::string map_entry = std::string(ggml_op_name(ggmlop));
|
std::string map_entry = std::string(ggml_op_name(ggmlop));
|
||||||
if (instance->_qnn_graph_map.find(map_entry) !=
|
if (instance->_qnn_graph_map.find(map_entry) !=
|
||||||
|
@ -2436,7 +2376,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions;
|
uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions;
|
||||||
uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions;
|
|
||||||
|
|
||||||
//TODO: for scenarios of quantized data in src0
|
//TODO: for scenarios of quantized data in src0
|
||||||
// pass-1: dequantize src0 to FP32
|
// pass-1: dequantize src0 to FP32
|
||||||
|
@ -2500,9 +2439,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
|
||||||
if (ctx->device == QNN_BACKEND_NPU) {
|
if (ctx->device == QNN_BACKEND_NPU) {
|
||||||
QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
|
QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
|
||||||
QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0};
|
QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0};
|
||||||
|
|
||||||
QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
|
|
||||||
QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx);
|
ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx);
|
||||||
|
@ -2514,27 +2450,20 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
|
||||||
QNN_LOG_INFO("error = %d\n", error);
|
QNN_LOG_INFO("error = %d\n", error);
|
||||||
goto failure;
|
goto failure;
|
||||||
}
|
}
|
||||||
error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2);
|
ggml_qnn_tensor_reader tensor_reader(dst, graph_handle, ctx);
|
||||||
if (QNN_SUCCESS != error) {
|
if (!tensor_writer0.is_valid()) {
|
||||||
QNN_LOG_INFO("error = %d\n", error);
|
|
||||||
goto failure;
|
goto failure;
|
||||||
}
|
}
|
||||||
|
|
||||||
QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
|
QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
|
||||||
QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1);
|
QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1);
|
||||||
QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
|
QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
|
||||||
QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
|
|
||||||
QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst);
|
|
||||||
QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
|
|
||||||
|
|
||||||
if (ctx->device != QNN_BACKEND_NPU) {
|
if (ctx->device != QNN_BACKEND_NPU) {
|
||||||
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
|
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
|
||||||
qnn_get_ggml_tensor_data_size(src1)};
|
qnn_get_ggml_tensor_data_size(src1)};
|
||||||
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
|
|
||||||
qnn_get_ggml_tensor_data_size(dst)};
|
|
||||||
} else {
|
} else {
|
||||||
uint8_t * qnn_buffer_1 = nullptr;
|
uint8_t * qnn_buffer_1 = nullptr;
|
||||||
uint8_t * qnn_buffer_2 = nullptr;
|
|
||||||
qnn_instance * instance = ctx->instance;
|
qnn_instance * instance = ctx->instance;
|
||||||
|
|
||||||
qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(
|
qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(
|
||||||
|
@ -2547,20 +2476,10 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
|
||||||
}
|
}
|
||||||
instance->register_rpcmem(qnn_buffer_1, tensor_1);
|
instance->register_rpcmem(qnn_buffer_1, tensor_1);
|
||||||
memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
|
memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
|
||||||
|
|
||||||
qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(
|
|
||||||
ggml_nbytes(dst), 4));
|
|
||||||
if (nullptr == qnn_buffer_2) {
|
|
||||||
QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
|
|
||||||
goto failure;
|
|
||||||
} else {
|
|
||||||
QNN_LOG_INFO("alloc rpcmem successfully\n");
|
|
||||||
}
|
|
||||||
instance->register_rpcmem(qnn_buffer_2, tensor_2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1};
|
Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1};
|
||||||
Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
|
Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()};
|
||||||
Qnn_OpConfig_t op_config = {
|
Qnn_OpConfig_t op_config = {
|
||||||
(Qnn_OpConfigVersion_t) 1,
|
(Qnn_OpConfigVersion_t) 1,
|
||||||
.v1 = {"ggml_op_mul_mat",
|
.v1 = {"ggml_op_mul_mat",
|
||||||
|
@ -2595,38 +2514,24 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
|
||||||
goto failure;
|
goto failure;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx->device == QNN_BACKEND_NPU) {
|
auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_reader.get_qnn_tensor());
|
||||||
uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
|
|
||||||
QNN_VER_PTR(*tensor_2)->memHandle));
|
|
||||||
memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
|
|
||||||
}
|
|
||||||
|
|
||||||
auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_2);
|
|
||||||
instance->_qnn_graph_map[map_entry] = graph_item;
|
instance->_qnn_graph_map[map_entry] = graph_item;
|
||||||
} else {
|
} else {
|
||||||
auto & graph_item= instance->_qnn_graph_map[map_entry];
|
auto & graph_item= instance->_qnn_graph_map[map_entry];
|
||||||
ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx);
|
ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx);
|
||||||
tensor_1 = std::get<2>(graph_item);
|
tensor_1 = std::get<2>(graph_item);
|
||||||
tensor_2 = std::get<3>(graph_item);
|
ggml_qnn_tensor_reader tensor_reader(dst, std::get<3>(graph_item), ctx);
|
||||||
|
|
||||||
uint32_t dimensions_input_1[] = {
|
uint32_t dimensions_input_1[] = {
|
||||||
(uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
|
(uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
|
||||||
(uint32_t) src1->ne[2], (uint32_t) src1->ne[3]};
|
(uint32_t) src1->ne[2], (uint32_t) src1->ne[3]};
|
||||||
uint32_t dimensions_output[] = {
|
|
||||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
|
|
||||||
(uint32_t) dst->ne[3]};
|
|
||||||
QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
|
QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
|
||||||
QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1);
|
QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1);
|
||||||
QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
|
QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
|
||||||
QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
|
|
||||||
QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst);
|
|
||||||
QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
|
|
||||||
|
|
||||||
if (ctx->device != QNN_BACKEND_NPU) {
|
if (ctx->device != QNN_BACKEND_NPU) {
|
||||||
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
|
QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
|
||||||
qnn_get_ggml_tensor_data_size(src1)};
|
qnn_get_ggml_tensor_data_size(src1)};
|
||||||
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
|
|
||||||
qnn_get_ggml_tensor_data_size(dst)};
|
|
||||||
} else {
|
} else {
|
||||||
uint8_t * qnn_buffer_1 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
|
uint8_t * qnn_buffer_1 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
|
||||||
QNN_VER_PTR(*tensor_1)->memHandle));
|
QNN_VER_PTR(*tensor_1)->memHandle));
|
||||||
|
@ -2635,7 +2540,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
|
||||||
}
|
}
|
||||||
|
|
||||||
Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1};
|
Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1};
|
||||||
Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
|
Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()};
|
||||||
error = qnn_raw_interface.graphExecute(graph_handle,
|
error = qnn_raw_interface.graphExecute(graph_handle,
|
||||||
tensor_inputs, 2,
|
tensor_inputs, 2,
|
||||||
tensor_outputs, 1,
|
tensor_outputs, 1,
|
||||||
|
@ -2649,19 +2554,11 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
|
||||||
QNN_LOG_INFO("error = %d\n", error);
|
QNN_LOG_INFO("error = %d\n", error);
|
||||||
goto failure;
|
goto failure;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx->device == QNN_BACKEND_NPU) {
|
|
||||||
uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
|
|
||||||
QNN_VER_PTR(*tensor_2)->memHandle));
|
|
||||||
if (nullptr != qnn_buffer_2)
|
|
||||||
memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
failure:
|
failure:
|
||||||
if (QNN_SUCCESS != error) {
|
if (QNN_SUCCESS != error) {
|
||||||
QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1));
|
QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1));
|
||||||
QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2));
|
|
||||||
QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
|
QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
|
||||||
" x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
|
" x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
|
||||||
src0->name, src0->type, ggml_type_name(src0->type),
|
src0->name, src0->type, ggml_type_name(src0->type),
|
||||||
|
@ -2679,7 +2576,6 @@ failure:
|
||||||
}
|
}
|
||||||
|
|
||||||
QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions;
|
QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions;
|
||||||
QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions;
|
|
||||||
perf.info();
|
perf.info();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue