From 6c68adc1d942a5a0173b537237656a4220e7487b Mon Sep 17 00:00:00 2001
From: hongruichen
Date: Fri, 14 Jun 2024 18:52:54 +0800
Subject: [PATCH] add ggml_qnn_tensor_binder

---
 ggml-qnn.cpp | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)

diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp
index f268c7f0e..62fee4281 100644
--- a/ggml-qnn.cpp
+++ b/ggml-qnn.cpp
@@ -1959,6 +1959,116 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
     return true;
 }
 
+template <Qnn_TensorType_t _tensorType>
+class ggml_qnn_tensor_binder
+{
+public:
+    ggml_qnn_tensor_binder(const ggml_tensor *tensor, ggml_backend_qnn_context * ctx, Qnn_GraphHandle_t graph_handle)
+        : _tensor(tensor)
+        , _qnn_tensor(reinterpret_cast<Qnn_Tensor_t *>(tensor->extra))
+        , _context(ctx) {
+        _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions;
+        const auto qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type);
+        const bool is_npu = ctx->device == QNN_BACKEND_NPU;
+        QNN_VER_PTR(*_qnn_tensor)->type = _tensorType;
+        if (is_npu) {
+            QNN_VER_PTR(*_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
+            QNN_VER_PTR(*_qnn_tensor)->clientBuf = {.data = nullptr, .dataSize = 0};
+        }
+
+        auto err = ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor);
+        if (err != QNN_SUCCESS) {
+            QNN_LOG_INFO("error = %d\n", err);
+            _context = nullptr;
+            return;
+        }
+
+        _dimensions[0] = (uint32_t)tensor->ne[0];
+        _dimensions[1] = (uint32_t)tensor->ne[1];
+        _dimensions[2] = (uint32_t)tensor->ne[2];
+        _dimensions[3] = (uint32_t)tensor->ne[3];
+        QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions;
+        QNN_VER_PTR(*_qnn_tensor)->rank = qnn_get_ggml_tensor_rank(tensor);
+        QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type;
+
+
+        if (is_npu) {
+            qnn_instance * instance = ctx->instance;
+            uint8_t * qnn_buffer = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                ggml_nbytes(tensor), 4)); // TODO: should we get the align param from device here?
+            if (!qnn_buffer) {
+                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+                _context = nullptr;
+                return;
+            } else {
+                QNN_LOG_INFO("alloc rpcmem successfully\n");
+            }
+
+            instance->register_rpcmem(qnn_buffer, _qnn_tensor);
+            if (_tensorType == QNN_TENSOR_TYPE_APP_WRITE || _tensorType == QNN_TENSOR_TYPE_APP_READWRITE) {
+                memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor));
+            }
+        } else {
+            QNN_VER_PTR(*_qnn_tensor)->clientBuf = {tensor->data,
+                qnn_get_ggml_tensor_data_size(tensor)};
+        }
+    }
+
+    ggml_qnn_tensor_binder(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor, ggml_backend_qnn_context * ctx)
+        : _tensor(tensor)
+        , _qnn_tensor(qnn_tensor)
+        , _context(ctx) {
+        _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions;
+        const auto qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type);
+        const bool is_npu = ctx->device == QNN_BACKEND_NPU;
+
+        _dimensions[0] = (uint32_t)tensor->ne[0];
+        _dimensions[1] = (uint32_t)tensor->ne[1];
+        _dimensions[2] = (uint32_t)tensor->ne[2];
+        _dimensions[3] = (uint32_t)tensor->ne[3];
+        QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions;
+        QNN_VER_PTR(*_qnn_tensor)->rank = qnn_get_ggml_tensor_rank(tensor);
+        QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type;
+
+
+        if (is_npu) {
+            uint8_t * qnn_buffer = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
+                QNN_VER_PTR(*_qnn_tensor)->memHandle));
+            if (qnn_buffer) {
+                memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor));
+            } else {
+                QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n");
+            }
+        } else {
+            QNN_VER_PTR(*_qnn_tensor)->clientBuf = {tensor->data,
+                qnn_get_ggml_tensor_data_size(tensor)};
+        }
+    }
+
+    ~ggml_qnn_tensor_binder() {
+        if (_context && _context->device == QNN_BACKEND_NPU &&
+            (_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ)) {
+            uint8_t * qnn_buffer = static_cast<uint8_t *>(_context->instance->get_rpcmem_from_memhandle(
+                QNN_VER_PTR(*_qnn_tensor)->memHandle));
+            memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor));
+        }
+
+        QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions;
+    }
+
+private:
+    const ggml_tensor *_tensor;
+    Qnn_Tensor_t *_qnn_tensor;
+    ggml_backend_qnn_context *_context;
+    uint32_t *_old_dimensions;
+    uint32_t _dimensions[4] = {};
+
+    ggml_qnn_tensor_binder(const ggml_qnn_tensor_binder&) = delete;
+    ggml_qnn_tensor_binder(ggml_qnn_tensor_binder&&) = delete;
+    void operator=(const ggml_qnn_tensor_binder&) = delete;
+    void operator=(ggml_qnn_tensor_binder&&) = delete;
+};
+
 //TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat
 // keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC
 static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
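
Note on intended use (not part of the diff): the class is an RAII helper templated on the QNN tensor type. On construction it points the underlying Qnn_Tensor_t at the ggml tensor's host data, or at a registered rpcmem buffer on the NPU backend, and on destruction it copies NPU results back into the ggml tensor and restores the saved dimensions pointer. The patch only adds the class and does not yet wire it into ggml_qnn_add, so the following is a minimal usage sketch, not code from this change; ggml_qnn_add_sketch, graph_handle, and the elided "build and execute the graph" step are placeholders.

    // Sketch only: relies on the declarations already present in ggml-qnn.cpp
    // (ggml_backend_qnn_context, the qnn_* helpers, and the QNN headers).
    static void ggml_qnn_add_sketch(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
                                    const ggml_tensor * src1, ggml_tensor * dst,
                                    Qnn_GraphHandle_t graph_handle) {
        // Inputs are written by the app and read by QNN; the output flows the other way.
        ggml_qnn_tensor_binder<QNN_TENSOR_TYPE_APP_WRITE> bind_src0(src0, ctx, graph_handle);
        ggml_qnn_tensor_binder<QNN_TENSOR_TYPE_APP_WRITE> bind_src1(src1, ctx, graph_handle);
        ggml_qnn_tensor_binder<QNN_TENSOR_TYPE_APP_READ>  bind_dst(dst, ctx, graph_handle);

        // ... add the op node and execute the graph via ctx->raw_interface here ...

        // When the binders leave scope, bind_dst copies the NPU rpcmem buffer back into
        // dst->data (NPU backend only) and every binder restores the original
        // dimensions pointer on its Qnn_Tensor_t.
    }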