diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp
index 1914e64dc..bafe5ca16 100644
--- a/ggml/src/ggml-qnn/backend-ops.cpp
+++ b/ggml/src/ggml-qnn/backend-ops.cpp
@@ -66,44 +66,43 @@ bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *gra
     return true;
 }
 
-template <size_t _InputSize>
-bool write_to_qnn_tensors(const std::array<ggml_tensor *, _InputSize> &inputs) {
-    for (auto &input : inputs) {
-        auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(input);
-        if (!tensor || !tensor->write_to_qnn_tensor()) {
-            return false;
-        }
-    }
-
-    return true;
-}
-
-template <size_t _OutputSize>
-bool read_from_qnn_tensors(const std::array<ggml_tensor *, _OutputSize> &outputs) {
-    for (auto &output : outputs) {
-        auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(output);
-        if (!tensor || !tensor->read_from_qnn_tensor()) {
-            return false;
-        }
-    }
-
-    return true;
-}
-
 template <size_t _InputSize, size_t _OutputSize>
 bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph,
                    const std::array<ggml_tensor *, _InputSize> &inputs,
                    const std::array<ggml_tensor *, _OutputSize> &outputs) {
-    if (!write_to_qnn_tensors<_InputSize>(inputs)) {
+
+    std::array<Qnn_Tensor_t, _InputSize> qnn_input_tensors;
+    for (size_t i = 0; i < inputs.size(); ++i) {
+        auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(inputs[i]);
+        if (!tensor || !tensor->write_to_qnn_tensor()) {
+            QNN_LOG_WARN("write_to_qnn_tensor failed\n");
+            return false;
+        }
+
+        qnn_input_tensors[i] = tensor->get_qnn_tensor();
+    }
+
+    std::array<Qnn_Tensor_t, _OutputSize> qnn_output_tensors;
+    for (size_t i = 0; i < outputs.size(); ++i) {
+        auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(outputs[i]);
+        if (!tensor) {
+            return false;
+        }
+
+        qnn_output_tensors[i] = tensor->get_qnn_tensor();
+    }
+
+    if (!graph->execute(qnn_input_tensors, qnn_output_tensors)) {
+        QNN_LOG_WARN("execute failed\n");
         return false;
     }
 
-    if (!graph->execute()) {
-        return false;
-    }
-
-    if (!read_from_qnn_tensors<_OutputSize>(outputs)) {
-        return false;
+    for (auto &output : outputs) {
+        auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(output);
+        if (!tensor || !tensor->read_from_qnn_tensor()) {
+            QNN_LOG_WARN("read_from_qnn_tensor failed\n");
+            return false;
+        }
     }
 
     return true;
diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp
index 01c44fe37..cb04b1efd 100644
--- a/ggml/src/ggml-qnn/graph.hpp
+++ b/ggml/src/ggml-qnn/graph.hpp
@@ -96,6 +96,7 @@ public:
             return false;
         }
 
+        QNN_LOG_DEBUG("graph name %s, add_nodes start", _graph_name.c_str());
         _tensor_inputs = tensor_inputs;
         _tensor_outputs = tensor_outputs;
 
@@ -116,10 +117,13 @@ public:
             return false;
         }
 
+        QNN_LOG_DEBUG("graph name %s, add_nodes succeeded", _graph_name.c_str());
         return true;
     }
 
-    bool execute() {
+    bool execute(const input_tensor_array_t &tensor_inputs, const output_tensor_array_t &tensor_outputs) {
+        _tensor_inputs = tensor_inputs;
+        _tensor_outputs = tensor_outputs;
         auto error = _qnn_interface.graphExecute(_graph_handle, _tensor_inputs.data(), _tensor_inputs.size(),
                                                  _tensor_outputs.data(), _tensor_outputs.size(), nullptr, nullptr);
         if (_device == QNN_BACKEND_NPU) {
diff --git a/ggml/src/ggml-qnn/qnn.hpp b/ggml/src/ggml-qnn/qnn.hpp
index 400ce005b..9d60d2f6c 100644
--- a/ggml/src/ggml-qnn/qnn.hpp
+++ b/ggml/src/ggml-qnn/qnn.hpp
@@ -661,13 +661,12 @@ public:
             QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error),
                          strerror(error));
             return 6;
-        } else {
-            QNN_LOG_INFO("tensor %s successfully register shared memory\n", QNN_TENSOR_GET_NAME(*p_tensor));
         }
 
         QNN_TENSOR_SET_MEM_HANDLE(*p_tensor, handle);
         _qnn_mem_set.insert((std::pair<void *, Qnn_MemHandle_t>(p_data, handle)));
+        QNN_LOG_INFO("tensor %s successfully registered shared memory handle: %p\n",
+                     QNN_TENSOR_GET_NAME(*p_tensor), handle);
 
         return 0;
     }
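Taken together, the backend-ops.cpp and graph.hpp changes move tensor binding from graph-build time to execution time: `execute_graph` now collects each tensor's `Qnn_Tensor_t` on every call and passes both arrays to `graph->execute`. A minimal sketch of a caller under the new signature (the wrapper `qnn_binary_op` below is illustrative, not part of this patch; the graph is assumed to have been built already via `add_nodes`):

```cpp
// Hypothetical 2-input/1-output op wrapper. execute_graph writes the inputs,
// runs the QNN graph with freshly collected Qnn_Tensor_t arrays, then reads
// the output back into the ggml tensor.
static bool qnn_binary_op(qnn::ggml_qnn_graph<2, 1> *graph,
                          ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) {
    const std::array<ggml_tensor *, 2> inputs = { src0, src1 };
    const std::array<ggml_tensor *, 1> outputs = { dst };
    return execute_graph<2, 1>(graph, inputs, outputs);
}
```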
diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp
index e966e638b..aeab60569 100644
--- a/ggml/src/ggml-qnn/tensor.hpp
+++ b/ggml/src/ggml-qnn/tensor.hpp
@@ -49,14 +49,9 @@ public:
         // TODO: set the quantizeParams base on the tensor type
         QNN_TENSOR_SET_RANK(_qnn_tensor, qnn::get_ggml_tensor_rank(tensor));
 
-        if (should_use_mem_handle()) {
-            QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE);
-            QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, nullptr);
-        } else {
-            QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
-            Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) };
-            QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
-        }
+        QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
+        Qnn_ClientBuffer_t client_buf = {};
+        QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
 
         tensor->extra = this;
         QNN_LOG_DEBUG("create tensor %s with device %d", _tensor_name.c_str(), device);
@@ -86,9 +81,26 @@ public:
             return false;
         }
 
-        if (!alloc_rpc_mem()) {
-            QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str());
-            return false;
+        if (should_use_mem_handle()) {
+            _qnn_rpc_buffer = alloc_rpc_mem();
+            if (!_qnn_rpc_buffer) {
+                QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str());
+                return false;
+            }
+
+            auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor);
+            if (!register_rpc_mem(_qnn_rpc_buffer)) {
+                QNN_LOG_WARN("register rpc mem failed, tensor %s", _tensor_name.c_str());
+                return false;
+            }
+
+            QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor));
+        } else {
+            QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
+            Qnn_ClientBuffer_t client_buf = { _tensor->data, get_ggml_tensor_data_size(_tensor) };
+            QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
+            QNN_LOG_DEBUG("tensor %s, use client buffer %p size %d", _tensor_name.c_str(), client_buf.data,
+                          (int)client_buf.dataSize);
         }
 
         QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(tensor));
@@ -111,10 +123,8 @@ public:
         }
 
         if (should_use_mem_handle()) {
-            uint8_t *qnn_buffer = static_cast<uint8_t *>(
-                _qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)));
-            if (qnn_buffer) {
-                memcpy(qnn_buffer, _tensor->data, ggml_nbytes(_tensor));
+            if (_qnn_rpc_buffer) {
+                memcpy(_qnn_rpc_buffer, _tensor->data, ggml_nbytes(_tensor));
             } else {
                 QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n");
                 return false;
@@ -122,6 +132,7 @@ public:
         }
 
         // For CPU and GPU, the data is already in the tensor.
+        QNN_LOG_DEBUG("write tensor %s to qnn", _tensor_name.c_str());
         return true;
     }
 
@@ -138,10 +149,8 @@ public:
         }
 
         if (should_use_mem_handle()) {
-            uint8_t *qnn_buffer = static_cast<uint8_t *>(
-                _qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)));
-            if (qnn_buffer) {
-                memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor));
+            if (_qnn_rpc_buffer) {
+                memcpy(_tensor->data, _qnn_rpc_buffer, ggml_nbytes(_tensor));
             } else {
                 QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n");
                 return false;
@@ -149,6 +158,7 @@ public:
         }
 
         // For CPU and GPU, the data is already in the tensor.
+        QNN_LOG_DEBUG("read tensor %s from qnn", _tensor_name.c_str());
         return true;
     }
 
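For context, `should_use_mem_handle()` is referenced throughout these hunks but is not changed by the patch. Judging from the NPU-only rpcmem handling above, it presumably reduces to a device check along these lines (a sketch of an assumed definition, not code from the tree):

```cpp
// Assumed predicate: only the NPU path copies through rpcmem + mem handles;
// CPU and GPU read/write the raw client buffer that aliases _tensor->data.
bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; }
```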
+ QNN_LOG_DEBUG("read tensor %s from qnn", _tensor_name.c_str()); return true; } @@ -156,28 +166,35 @@ public: const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } private: - bool alloc_rpc_mem() { - if (!should_use_mem_handle()) { + uint8_t *alloc_rpc_mem() { + uint8_t *qnn_rpc_buffer = + static_cast(_qnn_instance->alloc_rpcmem(ggml_nbytes(_tensor), alignof(void *))); + if (!qnn_rpc_buffer) { + QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno)); + QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); + return nullptr; + } + + QNN_LOG_INFO("tensor %s: alloc rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_rpc_buffer); + return qnn_rpc_buffer; + } + + bool register_rpc_mem(uint8_t *qnn_rpc_buffer) { + if (_qnn_instance->is_rpcmem_registered(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor))) { + QNN_LOG_INFO("tensor %s: rpcmem(%p) already registered\n", _tensor_name.c_str(), qnn_rpc_buffer); return true; } - uint8_t *qnn_buffer = - static_cast(_qnn_instance->alloc_rpcmem(ggml_nbytes(_tensor), alignof(void *))); - if (!qnn_buffer) { - QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno)); - QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); - return false; - } - - QNN_LOG_INFO("tensor %s: alloc rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_buffer); - - auto error = _qnn_instance->register_rpcmem(qnn_buffer, &_qnn_tensor); + auto error = _qnn_instance->register_rpcmem(qnn_rpc_buffer, &_qnn_tensor); if (error != QNN_SUCCESS) { QNN_LOG_WARN("register rpc mem failure, %d\n", (int)error); QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); return false; } + // The mem handle will be set at qnn_instance::register_rpcmem + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); + QNN_LOG_INFO("tensor %s: register rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_rpc_buffer); return true; } @@ -190,6 +207,7 @@ private: uint32_t _dimensions[4] = {}; std::string _tensor_name; Qnn_GraphHandle_t _graph_handle = nullptr; + uint8_t *_qnn_rpc_buffer = nullptr; ggml_qnn_tensor(const ggml_qnn_tensor &) = delete; void operator=(const ggml_qnn_tensor &) = delete;