use tensor wrapper in add

hongruichen 2024-06-15 11:13:30 +08:00
parent 6c68adc1d9
commit 37bb9263dd


@@ -1960,10 +1960,10 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
 }
 
 template <Qnn_TensorType_t _tensorType>
-class ggml_qnn_tensor_binder
+class ggml_qnn_tensor_readwrite
 {
 public:
-    ggml_qnn_tensor_binder(const ggml_tensor *tensor, ggml_backend_qnn_context * ctx, Qnn_GraphHandle_t graph_handle)
+    ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_GraphHandle_t graph_handle, ggml_backend_qnn_context * ctx)
         : _tensor(tensor)
         , _qnn_tensor(reinterpret_cast<Qnn_Tensor_t *>(tensor->extra))
         , _context(ctx) {
@@ -1979,6 +1979,7 @@ public:
         auto err = ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor);
         if (err != QNN_SUCCESS) {
             QNN_LOG_INFO("error = %d\n", err);
+            QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor));
             _context = nullptr;
             return;
         }
@@ -1998,7 +1999,9 @@ public:
                 ggml_nbytes(tensor), 4)); // TODO: should we get the align param from device here?
             if (!qnn_buffer) {
                 QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+                QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor));
                 _context = nullptr;
+                // TODO: should we free the tensor here?
                 return;
             } else {
                 QNN_LOG_INFO("alloc rpcmem successfully\n");
@@ -2014,7 +2017,7 @@ public:
         }
     }
 
-    ggml_qnn_tensor_binder(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor, ggml_backend_qnn_context * ctx)
+    ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor, ggml_backend_qnn_context * ctx)
         : _tensor(tensor)
         , _qnn_tensor(qnn_tensor)
         , _context(ctx) {
@@ -2038,6 +2041,9 @@ public:
                 memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor));
             } else {
                 QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n");
+                QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor));
+                _context = nullptr;
+                return;
             }
         } else {
             QNN_VER_PTR(*_qnn_tensor)->clientBuf = {tensor->data,
@@ -2045,7 +2051,7 @@ public:
         }
     }
 
-    ~ggml_qnn_tensor_binder() {
+    ~ggml_qnn_tensor_readwrite() {
         if (_context && _context->device == QNN_BACKEND_NPU &&
             (_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ)) {
             uint8_t * qnn_buffer = static_cast<uint8_t *>(_context->instance->get_rpcmem_from_memhandle(
@@ -2056,6 +2062,9 @@ public:
         QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions;
     }
 
+    bool is_valid() const { return _context; }
+    Qnn_Tensor_t * get_qnn_tensor() const { return _qnn_tensor; }
+
 private:
     const ggml_tensor *_tensor;
     Qnn_Tensor_t *_qnn_tensor;
@@ -2063,12 +2072,15 @@ private:
     uint32_t *_old_dimensions;
     uint32_t _dimensions[4] = {};
 
-    ggml_qnn_tensor_binder(const ggml_qnn_tensor_binder&) = delete;
-    ggml_qnn_tensor_binder(ggml_qnn_tensor_binder&&) = delete;
-    void operator=(const ggml_qnn_tensor_binder&) = delete;
-    void operator=(ggml_qnn_tensor_binder&&) = delete;
+    ggml_qnn_tensor_readwrite(const ggml_qnn_tensor_readwrite&) = delete;
+    void operator=(const ggml_qnn_tensor_readwrite&) = delete;
+    ggml_qnn_tensor_readwrite(ggml_qnn_tensor_readwrite&&) = delete;
+    void operator=(ggml_qnn_tensor_readwrite&&) = delete;
 };
 
+using ggml_qnn_tensor_reader = ggml_qnn_tensor_readwrite<QNN_TENSOR_TYPE_APP_READ>;
+using ggml_qnn_tensor_writer = ggml_qnn_tensor_readwrite<QNN_TENSOR_TYPE_APP_WRITE>;
+
 //TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat
 //      keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC
 static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
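The class above is an RAII guard for binding a ggml tensor to a QNN graph: the first constructor creates the graph tensor (and, on the NPU, allocates and registers rpcmem), the second rebinds an already-created Qnn_Tensor_t, and the destructor restores the tensor's saved dimensions (and reads NPU results back for reader types). A minimal usage sketch, not part of this commit — example_op is a hypothetical name, and only the wrapper API shown above is assumed:

    // Sketch: bind a ggml tensor as a graph input via the new writer alias,
    // assuming the surrounding types from this file are in scope.
    static void example_op(ggml_backend_qnn_context * ctx, const ggml_tensor * src,
                           Qnn_GraphHandle_t graph_handle) {
        ggml_qnn_tensor_writer tensor_writer(src, graph_handle, ctx);
        if (!tensor_writer.is_valid()) {
            return; // constructor failed (tensorCreateGraphTensor or rpcmem alloc)
        }
        Qnn_Tensor_t tensor_inputs[] = {*tensor_writer.get_qnn_tensor()};
        // ... build the Qnn_OpConfig_t and execute the graph ...
    }   // destructor restores the tensor's original dimensions; for readers on
        // the NPU it also copies the rpcmem buffer back into the ggml tensor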
@@ -2078,17 +2090,14 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
     qnn_instance * instance = nullptr;
     std::string graph_name = "ggml_op_qnn_add";
     Qnn_GraphHandle_t graph_handle = nullptr;
-    Qnn_Tensor_t * tensor_0 = nullptr;
     Qnn_Tensor_t * tensor_1 = nullptr;
     Qnn_Tensor_t * tensor_2 = nullptr;
     Qnn_Param_t qnn_params[] = {};
     enum ggml_op ggmlop = GGML_OP_ADD;
-    Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
     Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
     Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32;
 
     CHECK_PARAMS(ctx, src0, src1, dst);
-    tensor_0 = (Qnn_Tensor_t *) src0->extra;
     tensor_1 = (Qnn_Tensor_t *) src1->extra;
     tensor_2 = (Qnn_Tensor_t *) dst->extra;
     instance = ctx->instance;
@@ -2097,17 +2106,12 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
     qnn_perf perf("ggml_qnn_add");
     perf.start();
 
-    QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
 
-    src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type);
     src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type);
     dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type);
 
-    uint32_t dimensions_input_0[] = {
-        (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2],
-        (uint32_t) src0->ne[3]};
     uint32_t dimensions_input_1[] = {
         (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2],
         (uint32_t) src1->ne[3]};
@@ -2123,7 +2127,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
         graph_handle = std::get<0>(graph_item);
     }
 
-    uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions;
     uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions;
     uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions;
@@ -2185,9 +2188,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
         }
 
         if (ctx->device == QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
-            QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0};
             QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
             QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0};
@@ -2195,9 +2195,8 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
             QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0};
         }
 
-        error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
+        ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx);
+        if (!tensor_writer0.is_valid()) {
             goto failure;
         }
         error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1);
@@ -2211,9 +2210,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
             goto failure;
         }
 
-        QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
-        QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0);
-        QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type;
         QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
         QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1);
         QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
@@ -2222,29 +2218,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
         QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
 
         if (ctx->device != QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
-                                                 qnn_get_ggml_tensor_data_size(src0)};
             QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
                                                  qnn_get_ggml_tensor_data_size(src1)};
             QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
                                                  qnn_get_ggml_tensor_data_size(dst)};
         } else {
-            uint8_t * qnn_buffer_0 = nullptr;
             uint8_t * qnn_buffer_1 = nullptr;
             uint8_t * qnn_buffer_2 = nullptr;
             qnn_instance * instance = ctx->instance;
 
-            qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(
-                ggml_nbytes(src0), 4));
-            if (nullptr == qnn_buffer_0) {
-                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
-                goto failure;
-            } else {
-                QNN_LOG_INFO("alloc rpcmem successfully\n");
-            }
-            instance->register_rpcmem(qnn_buffer_0, tensor_0);
-            memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
-
             qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(
                 ggml_nbytes(src1), 4));
             if (nullptr == qnn_buffer_1) {
@@ -2267,7 +2249,7 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
             instance->register_rpcmem(qnn_buffer_2, tensor_2);
         }
 
-        Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
+        Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1};
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
         Qnn_OpConfig_t op_config = {
             (Qnn_OpConfigVersion_t) 1,
@@ -2308,18 +2290,14 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
                 QNN_VER_PTR(*tensor_2)->memHandle));
             memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
         }
 
-        auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2);
+        auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_2);
         instance->_qnn_graph_map[map_entry] = graph_item;
     } else {
         auto & graph_item = instance->_qnn_graph_map[map_entry];
-        graph_handle = std::get<0>(graph_item);
-        tensor_0 = std::get<1>(graph_item);
+        ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx);
         tensor_1 = std::get<2>(graph_item);
         tensor_2 = std::get<3>(graph_item);
 
-        uint32_t dimensions_input_0[] = {
-            (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-            (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]};
         uint32_t dimensions_input_1[] = {
             (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
             (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]};
@@ -2327,9 +2305,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
             (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
             (uint32_t) dst->ne[3]};
 
-        QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
-        QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0);
-        QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type;
         QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
         QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1);
         QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
@@ -2338,25 +2313,18 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
         QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
 
         if (ctx->device != QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
-                                                 qnn_get_ggml_tensor_data_size(src0)};
             QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
                                                  qnn_get_ggml_tensor_data_size(src1)};
             QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
                                                  qnn_get_ggml_tensor_data_size(dst)};
         } else {
-            uint8_t * qnn_buffer_0 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_0)->memHandle));
-            if (nullptr != qnn_buffer_0)
-                memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
-
             uint8_t * qnn_buffer_1 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
                 QNN_VER_PTR(*tensor_1)->memHandle));
             if (nullptr != qnn_buffer_1)
                 memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
         }
 
-        Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
+        Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1};
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
         error = qnn_raw_interface.graphExecute(graph_handle,
                                                tensor_inputs,2,
@@ -2382,7 +2350,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
 
 failure:
     if (QNN_SUCCESS != error) {
-        QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0));
         QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1));
         QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2));
         QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
@@ -2402,7 +2369,6 @@ failure:
                       dst->nb[1], dst->nb[2]);
     }
 
-    QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions;
     QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions;
     QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions;
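Note that this commit converts only src0; tensor_1 and tensor_2 keep the manual QNN_VER_PTR bookkeeping, which is why the failure path above still restores their dimensions by hand. A hedged sketch of what the same conversion could look like for the output tensor, assuming a follow-up change — tensor_reader2 is a hypothetical name, and ggml_qnn_tensor_reader is the alias introduced above:

    // Hypothetical follow-up, not in this commit: route dst through the reader alias.
    ggml_qnn_tensor_reader tensor_reader2(dst, graph_handle, ctx);
    if (!tensor_reader2.is_valid()) {
        goto failure;
    }
    Qnn_Tensor_t tensor_outputs[] = {*tensor_reader2.get_qnn_tensor()};
    // After graphExecute, the reader's destructor performs the NPU read-back
    // (copying the rpcmem buffer into dst->data) and restores the saved
    // dimensions, replacing the manual cleanup ggml_qnn_add still does above.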