From 3b47056c97a01fd176ce46ce969b95d71884919f Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 22 Jul 2024 12:45:26 +0800 Subject: [PATCH] refactoring: change the tensor binding mode between qnn tensor and ggml tensor --- ggml/src/ggml-qnn.cpp | 36 +----- ggml/src/ggml-qnn/backend-ops.cpp | 126 ++++----------------- ggml/src/ggml-qnn/backend-ops.hpp | 4 +- ggml/src/ggml-qnn/backend.hpp | 6 +- ggml/src/ggml-qnn/graph.hpp | 124 ++++++++++++++------- ggml/src/ggml-qnn/qnn-lib.hpp | 2 +- ggml/src/ggml-qnn/tensor.hpp | 175 +++++++++++++++--------------- ggml/src/ggml-qnn/utils.hpp | 109 +++++++++++-------- 8 files changed, 261 insertions(+), 321 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 46718af09..87653cfb1 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -87,30 +87,12 @@ public: } ~ggml_backend_qnn_buffer_context() { - _tensors.clear(); - // the free will do nothing if the _buffer is nullptr qnn::align_free(_buffer); } bool is_valid() const { return _buffer != nullptr; } - bool init_tensor(ggml_tensor *tensor) { - if (qnn::ggml_qnn_tensor::from_ggml_tensor(tensor)) { - QNN_LOG_INFO("tensor %s already initialized", tensor->name); - return true; - } - - auto qnn_tensor = std::make_unique(tensor, _device, _instance); - if (!qnn_tensor->is_valid()) { - QNN_LOG_WARN("create ggml_qnn_tensor failed"); - return false; - } - - _tensors.push_back(std::move(qnn_tensor)); - return true; - } - void *get_buffer() { return _buffer; } size_t get_buffer_size() { return _buffer_size; } @@ -118,7 +100,6 @@ private: QNNBackend _device; std::shared_ptr _instance; std::string _name; - std::list> _tensors; void *_buffer = nullptr; size_t _buffer_size = 0; }; @@ -175,12 +156,9 @@ GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t bu } GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { - ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - - if (!ctx->init_tensor(tensor)) { - QNN_LOG_WARN("init ggml_qnn_tensor failed"); - return; - } + // Do nothing here, the qnn tensor will be create along with the graph. + GGML_UNUSED(buffer); + GGML_UNUSED(tensor); } GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, @@ -271,13 +249,7 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { auto instance = g_qnn_mgr[ctx->device].instance; if (instance) { - ctx->qnn_unary_graph_cache.clear(); - for (const auto &graph_item : ctx->qnn_binary_graph_cache) { - QNN_LOG_INFO("graph type:%s", graph_item.first.c_str()); - } - - ctx->qnn_binary_graph_cache.clear(); - + ctx->qnn_graph_cache.clear(); instance->qnn_finalize(); g_qnn_mgr[ctx->device].instance.reset(); } diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 6896454aa..bd87cfc9e 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -19,10 +19,8 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src, } auto instance = ctx->instance; - auto *tensor0 = qnn::ggml_qnn_tensor::from_ggml_tensor(src); - auto *tensor1 = qnn::ggml_qnn_tensor::from_ggml_tensor(dst); - if (!instance || !tensor0 || !tensor1) { - QNN_LOG_WARN("invalid tensors\n"); + if (!instance) { + QNN_LOG_WARN("invalid instance\n"); return false; } @@ -37,11 +35,8 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, } auto instance = ctx->instance; - auto *tensor0 = qnn::ggml_qnn_tensor::from_ggml_tensor(src0); - auto *tensor1 = qnn::ggml_qnn_tensor::from_ggml_tensor(src1); - auto *tensor2 = qnn::ggml_qnn_tensor::from_ggml_tensor(dst); - if (!instance || !tensor0 || !tensor1 || !tensor2) { - QNN_LOG_WARN("invalid tensors\n"); + if (!instance) { + QNN_LOG_WARN("invalid instance\n"); return false; } @@ -67,104 +62,29 @@ void print_ggml_tensor(const ggml_tensor *tensor) { tensor->nb[0], tensor->nb[1], tensor->nb[2]); } -template -bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, const std::string &op_name, - const std::array &inputs, - const std::array &outputs) { - std::array qnn_input_tensors; - for (size_t i = 0; i < inputs.size(); ++i) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(inputs[i]); - if (!tensor || !tensor->bind_to_graph(*graph, true)) { - return false; - } - - qnn_input_tensors[i] = tensor->get_qnn_tensor(); - } - - std::array qnn_output_tensors; - for (size_t i = 0; i < outputs.size(); ++i) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(outputs[i]); - if (!tensor || !tensor->bind_to_graph(*graph, false)) { - return false; - } - - qnn_output_tensors[i] = tensor->get_qnn_tensor(); - } - - if (!graph->add_nodes(op_name, qnn_input_tensors, qnn_output_tensors)) { - return false; - } - - return true; +template +qnn::ggml_tensor_array_t to_ggml_tensor_array(const std::array &array) { + return qnn::ggml_tensor_array_t(array.data(), array.data() + _Size); } template -bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, - const std::array &inputs, +bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array &inputs, const std::array &outputs) { - - std::array qnn_input_tensors; - for (size_t i = 0; i < inputs.size(); ++i) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(inputs[i]); - if (!tensor || !tensor->write_to_qnn_tensor()) { - QNN_LOG_WARN("write_to_qnn_tensor failed\n"); - return false; - } - - qnn_input_tensors[i] = tensor->get_qnn_tensor(); - } - - std::array qnn_output_tensors; - for (size_t i = 0; i < outputs.size(); ++i) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(outputs[i]); - if (!tensor) { - return false; - } - - qnn_output_tensors[i] = tensor->get_qnn_tensor(); - } - - if (!graph->execute(qnn_input_tensors, qnn_output_tensors)) { + if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<_OutputSize>(outputs))) { QNN_LOG_WARN("execute failed\n"); return false; } - for (auto &output : outputs) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(output); - if (!tensor || !tensor->read_from_qnn_tensor()) { - QNN_LOG_WARN("read_from_qnn_tensors failed\n"); - return false; - } - } - return true; } -qnn::ggml_qnn_unary_graph_cache_t &get_qnn_graph_cache(ggml_backend_qnn_context *ctx, - const std::array &inputs, - const std::array &outputs) { - GGML_UNUSED(inputs); - GGML_UNUSED(outputs); - return ctx->qnn_unary_graph_cache; -} - -qnn::ggml_qnn_binary_graph_cache_t &get_qnn_graph_cache(ggml_backend_qnn_context *ctx, - const std::array &inputs, - const std::array &outputs) { - GGML_UNUSED(inputs); - GGML_UNUSED(outputs); - return ctx->qnn_binary_graph_cache; -} - template -qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( - ggml_backend_qnn_context *ctx, size_t op, const std::string &qnn_op, - const std::array &inputs, const std::array &outputs) { - using graph_t = qnn::ggml_qnn_graph<_InputSize, _OutputSize>; - +qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op, const std::string &qnn_op, + const std::array &inputs, + const std::array &outputs) { GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); - auto &graph_cache = get_qnn_graph_cache(ctx, inputs, outputs); + auto &graph_cache = ctx->qnn_graph_cache; const auto *op_name = op < qnn::kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - qnn::kGgmlUnaryOpStart)); std::string graph_key(op_name); @@ -178,21 +98,21 @@ qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( } auto it = graph_cache.find(graph_key); - graph_t *graph_ptr = nullptr; + qnn::ggml_qnn_graph *graph_ptr = nullptr; if (it != graph_cache.end()) { QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str()); graph_ptr = it->second.get(); } else { - auto graph = - std::make_unique(graph_key, (QNNBackend)(ctx->device), ctx->instance->get_qnn_context_handle(), - ctx->qnn_interface, ctx->socinfo.vtcm_size_in_mb); + auto graph = std::make_unique(graph_key, (QNNBackend)(ctx->device), ctx->instance, + ctx->socinfo.vtcm_size_in_mb); if (!graph->is_valid()) { return nullptr; } - if (!qnn_bind_tensors_to_graph<_InputSize, _OutputSize>(graph.get(), qnn_op.c_str(), inputs, outputs)) { - QNN_LOG_ERROR("qnn_bind_tensors_to_graph failed\n"); + if (!graph->build_graph(qnn_op, to_ggml_tensor_array<_InputSize>(inputs), + to_ggml_tensor_array<_OutputSize>(outputs))) { + QNN_LOG_ERROR("build_graph failed\n"); return nullptr; } @@ -309,15 +229,13 @@ static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + qnn::kGgmlUnaryOpStart] != nul "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); template -bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { +bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); CHECK_PARAMS(ctx, src0, src1, dst); bool succeed = false; - qnn::ggml_qnn_graph_binary *graph_ptr = - get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src0, src1 }, { dst }); + auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src0, src1 }, { dst }); if (graph_ptr) { succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); } @@ -332,7 +250,7 @@ bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, } template -bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { +bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); CHECK_PARAMS(ctx, src, dst); diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index 8cc2dc366..614bcf651 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -6,8 +6,8 @@ namespace qnn { -typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst); -typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, +typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_tensor *dst); +typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst); typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT + GGML_UNARY_OP_COUNT]; diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 32f3c6cd4..b2f93a8f7 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -12,8 +12,7 @@ #include "qnn-lib.hpp" namespace qnn { -typedef std::unordered_map> ggml_qnn_unary_graph_cache_t; -typedef std::unordered_map> ggml_qnn_binary_graph_cache_t; +typedef std::unordered_map> ggml_qnn_graph_cache_t; } // namespace qnn struct ggml_backend_qnn_context { @@ -25,8 +24,7 @@ struct ggml_backend_qnn_context { qnn::qcom_socinfo socinfo = {}; std::shared_ptr instance; std::shared_ptr qnn_interface; - qnn::ggml_qnn_unary_graph_cache_t qnn_unary_graph_cache; - qnn::ggml_qnn_binary_graph_cache_t qnn_binary_graph_cache; + qnn::ggml_qnn_graph_cache_t qnn_graph_cache; explicit ggml_backend_qnn_context(int device, int threads, const char *name, const char *lib) : device(device), threads(threads) { diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 30f96a994..9941365f7 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -1,27 +1,29 @@ #pragma once -#include +#include #include +#include #include "ggml-qnn.h" #include "logger.hpp" #include "qnn-lib.hpp" +#include "tensor.hpp" namespace qnn { -template +using ggml_tensor_array_t = std::vector; + class ggml_qnn_graph { public: - typedef std::array input_tensor_array_t; - typedef std::array output_tensor_array_t; - - explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, Qnn_ContextHandle_t qnn_context, - std::shared_ptr qnn_interface, size_t vtcm_size_in_mb) : - _graph_name(graph_name), _device(device), _qnn_interface(qnn_interface) { + explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, + std::shared_ptr qnn_instance, size_t vtcm_size_in_mb) : + _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { QNN_LOG_INFO("graph name %s", graph_name.c_str()); + auto qnn_interface = qnn_instance->get_qnn_interface(); + auto qnn_context = qnn_instance->get_qnn_context_handle(); Qnn_ErrorHandle_t error = QNN_SUCCESS; Qnn_GraphHandle_t graph_handle = nullptr; if (device == QNN_BACKEND_NPU) { @@ -72,34 +74,52 @@ public: QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); _graph_handle = graph_handle; + _qnn_interface = qnn_interface; } - bool create_graph_tensor(Qnn_Tensor_t &tensor) { - if (!is_valid()) { - QNN_LOG_ERROR("Invalid graph\n"); - return false; - } + ~ggml_qnn_graph() { QNN_LOG_DEBUG("graph name %s, destroy", _graph_name.c_str()); } - auto err = _qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &tensor); - if (err != QNN_SUCCESS) { - QNN_LOG_INFO("error = %d\n", err); - QNN_LOG_DEBUG("tensor%p name %s", &tensor, QNN_TENSOR_GET_NAME(tensor)); - return false; - } - - return true; - } - - bool add_nodes(const std::string &op_name, const input_tensor_array_t &tensor_inputs, - const output_tensor_array_t &tensor_outputs) { + bool build_graph(const std::string &op_name, const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) { if (!is_valid()) { QNN_LOG_ERROR("Invalid graph\n"); return false; } QNN_LOG_DEBUG("graph name %s, add_nodes start", _graph_name.c_str()); - _tensor_inputs = tensor_inputs; - _tensor_outputs = tensor_outputs; + _qnn_tensor_inputs.resize(tensor_inputs.size()); + _tensor_inputs.resize(tensor_inputs.size()); + for (size_t i = 0; i < tensor_inputs.size(); i++) { + char buffer[GGML_MAX_NAME] = {}; + snprintf(buffer, GGML_MAX_NAME, "src%d", (int)i); + auto qnn_tensor = + std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); + auto *ggml_tensor = tensor_inputs[i]; + if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, true)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + _qnn_tensor_inputs[i] = qnn_tensor->get_qnn_tensor(); + _tensor_inputs[i] = qnn_tensor; + } + + _qnn_tensor_outputs.resize(tensor_outputs.size()); + _tensor_outputs.resize(tensor_outputs.size()); + for (size_t i = 0; i < tensor_outputs.size(); i++) { + char buffer[GGML_MAX_NAME] = {}; + snprintf(buffer, GGML_MAX_NAME, "dst%d", (int)i); + auto qnn_tensor = + std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); + auto *ggml_tensor = tensor_inputs[i]; + if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, false)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + _qnn_tensor_outputs[i] = qnn_tensor->get_qnn_tensor(); + _tensor_outputs[i] = qnn_tensor; + } Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; config.version = QNN_OPCONFIG_VERSION_1; @@ -109,10 +129,10 @@ public: op_config.typeName = op_name.c_str(); op_config.numOfParams = (uint32_t)_param_types.size(); op_config.params = _param_types.data(); - op_config.numOfInputs = (uint32_t)_tensor_inputs.size(); - op_config.inputTensors = _tensor_inputs.data(); - op_config.numOfOutputs = (uint32_t)_tensor_outputs.size(); - op_config.outputTensors = _tensor_outputs.data(); + op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); + op_config.inputTensors = _qnn_tensor_inputs.data(); + op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); + op_config.outputTensors = _qnn_tensor_outputs.data(); auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, config); if (error != QNN_SUCCESS) { auto *error_str = get_qnn_error_string(error); @@ -139,12 +159,32 @@ public: return true; } - bool execute(const input_tensor_array_t &tensor_inputs, const output_tensor_array_t &tensor_outputs) { - _tensor_inputs = tensor_inputs; - _tensor_outputs = tensor_outputs; + bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { + GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size()); + GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); + for (size_t i = 0; i < tensor_inputs.size(); i++) { + auto *ggml_tensor = tensor_inputs[i]; + if (!_tensor_inputs[i]->bind_ggml_tensor(ggml_tensor, true)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + _qnn_tensor_inputs[i] = _tensor_inputs[i]->get_qnn_tensor(); + } + + for (size_t i = 0; i < tensor_outputs.size(); i++) { + auto *ggml_tensor = tensor_inputs[i]; + if (!_tensor_outputs[i]->bind_ggml_tensor(ggml_tensor, false)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + _qnn_tensor_outputs[i] = _tensor_outputs[i]->get_qnn_tensor(); + } + auto error = - _qnn_interface->qnn_graph_execute(_graph_handle, _tensor_inputs.data(), _tensor_inputs.size(), - _tensor_outputs.data(), _tensor_outputs.size(), nullptr, nullptr); + _qnn_interface->qnn_graph_execute(_graph_handle, _qnn_tensor_inputs.data(), _qnn_tensor_inputs.size(), + _qnn_tensor_outputs.data(), _qnn_tensor_outputs.size(), nullptr, nullptr); if (_device == QNN_BACKEND_NPU) { if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); @@ -168,10 +208,13 @@ public: private: const std::string _graph_name; const QNNBackend _device; - std::shared_ptr _qnn_interface; Qnn_GraphHandle_t _graph_handle = nullptr; - std::array _tensor_inputs; - std::array _tensor_outputs; + std::shared_ptr _qnn_instance; + std::shared_ptr _qnn_interface; + std::vector> _tensor_inputs; + std::vector> _tensor_outputs; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; std::vector _param_types; ggml_qnn_graph(const ggml_qnn_graph &) = delete; @@ -180,7 +223,4 @@ private: void operator=(ggml_qnn_graph &&) = delete; }; -using ggml_qnn_graph_binary = ggml_qnn_graph<2, 1>; -using ggml_qnn_graph_unary = ggml_qnn_graph<1, 1>; - } // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 517df493c..4e1dcb34c 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -366,7 +366,7 @@ public: size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 }; size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4)); + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *))); if (!rpc_buffer) { QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); break; diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 49e9258c3..5e45266b4 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -10,7 +10,6 @@ #include "QnnTensor.h" #include "System/QnnSystemInterface.h" -#include "graph.hpp" #include "logger.hpp" #include "qnn-lib.hpp" #include "utils.hpp" @@ -19,68 +18,47 @@ namespace qnn { class ggml_qnn_tensor { public: - static ggml_qnn_tensor *from_ggml_tensor(const ggml_tensor *tensor) { - if (!tensor) { - return nullptr; - } - - return static_cast(tensor->extra); - } - - explicit ggml_qnn_tensor(ggml_tensor *tensor, QNNBackend device, std::shared_ptr qnn_instance) : - _tensor(tensor), _device(device), _qnn_instance(qnn_instance) { - update_tensor_name(); + explicit ggml_qnn_tensor(const std::string &name, QNNBackend device, Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance) : + _tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) { QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); - _dimensions[0] = (uint32_t)tensor->ne[0]; - _dimensions[1] = (uint32_t)tensor->ne[1]; - _dimensions[2] = (uint32_t)tensor->ne[2]; - _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions); - auto qnn_tensor_type = device_tensortype_from_ggml_tensor(tensor); - QNN_TENSOR_SET_TYPE(_qnn_tensor, qnn_tensor_type); + QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); + QNN_TENSOR_SET_TYPE(_qnn_tensor, QNN_TENSOR_TYPE_NATIVE); QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); - QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); - // TODO: set the quantizeParams base on the tensor type - QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)ggml_n_dims(tensor)); - - QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = {}; - QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); - - tensor->extra = this; - QNN_LOG_DEBUG("create tensor %s, device: %d, qnn_type: %d", _tensor_name.c_str(), device, (int)qnn_tensor_type); + QNN_LOG_DEBUG("create tensor %s, device: %d", _tensor_name.c_str(), device); } - template - bool bind_to_graph(ggml_qnn_graph<_InputSize, _OutputSize> &graph, bool is_input) { - if (!is_valid()) { - QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); - return false; - } - - if (_graph_handle) { - if (_graph_handle != graph.get_graph_handler()) { - QNN_LOG_WARN("tensor %s has been bound to another graph", _tensor_name.c_str()); + bool bind_ggml_tensor(ggml_tensor *tensor, bool is_input) { + if (_tensor) { + if (_tensor != tensor) { + QNN_LOG_WARN("tensor %s has been bound to another ggml tensor %s", _tensor_name.c_str(), + ggml_get_name(_tensor)); return false; } else { - QNN_LOG_INFO("tensor %s already bound to same graph %s", _tensor_name.c_str(), - graph.get_name().c_str()); + QNN_LOG_INFO("tensor %s already bound to same ggml tensor %s", _tensor_name.c_str(), + ggml_get_name(_tensor)); return true; } } + update_params_from_ggml_tensor(tensor); Qnn_TensorType_t new_tensor_type = is_input ? QNN_TENSOR_TYPE_APP_WRITE : QNN_TENSOR_TYPE_APP_READ; QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); - update_tensor_name(); - Qnn_Tensor_t tensor = _qnn_tensor; - if (!graph.create_graph_tensor(tensor)) { - QNN_LOG_WARN("create graph tensor failed, tensor %s", _tensor_name.c_str()); - return false; + + if (!QNN_TENSOR_GET_ID(_qnn_tensor)) { + Qnn_Tensor_t qnn_tensor = _qnn_tensor; + auto qnn_interface = _qnn_instance->get_qnn_interface(); + auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error); + return false; + } + QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); } if (should_use_mem_handle()) { - _qnn_rpc_buffer = alloc_rpc_mem(); + _qnn_rpc_buffer = alloc_rpc_mem(ggml_nbytes(tensor)); if (!_qnn_rpc_buffer) { QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); return false; @@ -89,28 +67,59 @@ public: QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = { _tensor->data, get_ggml_tensor_data_size(_tensor) }; + Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) }; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); QNN_LOG_DEBUG("tensor %s, use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, (int)client_buf.dataSize); } - QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(tensor)); - _graph_handle = graph.get_graph_handler(); + _tensor = tensor; - QNN_LOG_DEBUG("bind tensor %s to graph %s", _tensor_name.c_str(), graph.get_name().c_str()); - return true; - } - - bool write_to_qnn_tensor() { - if (!is_valid()) { - QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); + if (!write_to_qnn_tensor()) { + QNN_LOG_WARN("write to qnn tensor failed, tensor %s", _tensor_name.c_str()); return false; } + QNN_LOG_DEBUG("bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor)); + return true; + } + + bool unbind_ggml_tensor() { + if (!_graph_handle) { + QNN_LOG_WARN("tensor %s not bound to any graph", _tensor_name.c_str()); + return false; + } + + if (!_tensor) { + QNN_LOG_DEBUG("tensor %s not bound to ggml tensor", _tensor_name.c_str()); + return true; + } + + if (!read_from_qnn_tensor()) { + QNN_LOG_WARN("read from qnn tensor failed, tensor %s", _tensor_name.c_str()); + return false; + } + + if (!should_use_mem_handle()) { + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = {}; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + QNN_LOG_DEBUG("tensor %s, clear client buffer", _tensor_name.c_str()); + } + + _tensor = nullptr; + QNN_LOG_DEBUG("unbind tensor: %s from ggml tensor: %s", _tensor_name.c_str(), _tensor->name); + return true; + } + + const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } + +private: + bool write_to_qnn_tensor() { auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_WARN("tensor %s type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type); + QNN_LOG_DEBUG("tensor %s type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type); + return true; } if (should_use_mem_handle()) { @@ -128,14 +137,10 @@ public: } bool read_from_qnn_tensor() { - if (!is_valid()) { - QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); - return false; - } - auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_WARN("tensor %s type(%d) not READ", _tensor_name.c_str(), (int)tensor_type); + QNN_LOG_DEBUG("tensor %s type(%d) not READ", _tensor_name.c_str(), (int)tensor_type); + return true; } if (should_use_mem_handle()) { @@ -152,13 +157,8 @@ public: return true; } - bool is_valid() const { return _tensor; } - const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } - -private: - uint8_t *alloc_rpc_mem() { - uint8_t *qnn_rpc_buffer = - static_cast(_qnn_instance->alloc_rpcmem(ggml_nbytes(_tensor), alignof(void *))); + uint8_t *alloc_rpc_mem(size_t bytes) { + uint8_t *qnn_rpc_buffer = static_cast(_qnn_instance->alloc_rpcmem(bytes, alignof(void *))); if (!qnn_rpc_buffer) { QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno)); QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); @@ -180,29 +180,28 @@ private: return qnn_rpc_buffer; } - bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; } + void update_params_from_ggml_tensor(ggml_tensor *tensor) { + _dimensions[0] = (uint32_t)tensor->ne[0]; + _dimensions[1] = (uint32_t)tensor->ne[1]; + _dimensions[2] = (uint32_t)tensor->ne[2]; + _dimensions[3] = (uint32_t)tensor->ne[3]; + QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); + // TODO: set the quantizeParams base on the tensor type + QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)ggml_n_dims(tensor)); - void update_tensor_name() { - auto *tensor_name = ggml_get_name(_tensor); - if (!strnlen(tensor_name, GGML_MAX_NAME)) { - if (_tensor_name.empty()) { - static std::atomic_uint32_t unnamed_tensor_count = 0; - char buffer[GGML_MAX_NAME] = {}; - snprintf(buffer, sizeof(buffer), "unnamed_%d", (int)(unnamed_tensor_count++)); - _tensor_name = buffer; - } - } else { - QNN_LOG_DEBUG("tensor name changed: %s -> %s", _tensor_name.c_str(), tensor_name); - _tensor_name = tensor_name; - } + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = {}; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); } + bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; } + + std::string _tensor_name; const ggml_tensor *_tensor; QNNBackend _device; std::shared_ptr _qnn_instance; - Qnn_Tensor_t _qnn_tensor = QNN_TENSOR_INIT; - uint32_t _dimensions[4] = {}; - std::string _tensor_name; + Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); + std::array _dimensions = {}; Qnn_GraphHandle_t _graph_handle = nullptr; uint8_t *_qnn_rpc_buffer = nullptr; diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index e91a5ae87..c2da6cb27 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -13,6 +13,8 @@ #include "QnnTypes.h" #include "logger.hpp" +#define QNN_TENSOR_VER(x) ((x).v2) + namespace qnn { uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor); @@ -29,149 +31,159 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop); const char *get_qnn_error_string(Qnn_ErrorHandle_t error); -inline int validate_tensor_version(const Qnn_Tensor_t &tensor) { - if (tensor.version != QNN_TENSOR_VERSION_1) { - QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", tensor.v1.name, - tensor.version); - return 1; +constexpr const Qnn_TensorVersion_t kDefaultQnnTensorVersion = QNN_TENSOR_VERSION_2; + +inline Qnn_Tensor_t qnn_tensor_init(Qnn_TensorVersion_t version) { + Qnn_Tensor_t tensor; + tensor.version = version; + if (version == QNN_TENSOR_VERSION_1) { + tensor.v1 = QNN_TENSOR_V1_INIT; + } else if (version == QNN_TENSOR_VERSION_2) { + tensor.v2 = QNN_TENSOR_V2_INIT; } - return 0; + return tensor; } inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.id; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).id; } return 0u; } inline const char *get_qnn_tensorname(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.name; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).name; } return nullptr; } inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.type; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).type; } return QNN_TENSOR_TYPE_UNDEFINED; } inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataFormat; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).dataFormat; } return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; } inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataType; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).dataType; } return QNN_DATATYPE_UNDEFINED; } inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.quantizeParams; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).quantizeParams; } return QNN_QUANTIZE_PARAMS_INIT; } inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.rank; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).rank; } return 0u; } inline uint32_t *get_qnn_tensor_dimensions(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dimensions; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).dimensions; } return nullptr; } inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memType; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).memType; } return QNN_TENSORMEMTYPE_UNDEFINED; } inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memHandle; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).memHandle; } return nullptr; } inline void set_qnn_tensor_id(Qnn_Tensor_t &tensor, uint32_t id) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.id = id; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).id = id; } } inline void set_qnn_tensor_name(Qnn_Tensor_t &tensor, const char *name) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.name = name; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).name = name; } } inline void set_qnn_tensor_type(Qnn_Tensor_t &tensor, Qnn_TensorType_t type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.type = type; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).type = type; } } inline void set_qnn_tensor_dataformat(Qnn_Tensor_t &tensor, Qnn_TensorDataFormat_t format) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataFormat = format; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).dataFormat = format; } } inline void set_qnn_tensor_datatype(Qnn_Tensor_t &tensor, Qnn_DataType_t dataType) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataType = dataType; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).dataType = dataType; } } inline void set_qnn_tensor_quantparams(Qnn_Tensor_t &tensor, Qnn_QuantizeParams_t params) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.quantizeParams = params; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).quantizeParams = params; } } inline void set_qnn_tensor_rank(Qnn_Tensor_t &tensor, uint32_t rank) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.rank = rank; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).rank = rank; } } inline void set_qnn_tensor_dimensions(Qnn_Tensor_t &tensor, uint32_t *dims) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dimensions = dims; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).dimensions = dims; } } inline void set_qnn_tensor_memtype(Qnn_Tensor_t &tensor, Qnn_TensorMemType_t mem_type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memType = mem_type; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).memType = mem_type; } } inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t &tensor, Qnn_ClientBuffer_t client_buf) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.clientBuf = client_buf; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).clientBuf = client_buf; } } inline void set_qnn_tensor_memhandle(Qnn_Tensor_t &tensor, Qnn_MemHandle_t handle) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memHandle = handle; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).memHandle = handle; + } +} + +inline void set_qnn_tensor_dyn_dimensions(Qnn_Tensor_t &tensor, uint8_t *isDynamicDimensions) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).isDynamicDimensions = isDynamicDimensions; } } @@ -239,3 +251,4 @@ public: #define QNN_TENSOR_SET_MEM_TYPE(tensor, value) qnn::set_qnn_tensor_memtype(tensor, value) #define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) qnn::set_qnn_tensor_clientbuf(tensor, value) #define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) qnn::set_qnn_tensor_memhandle(tensor, value) +#define QNN_TENSOR_SET_DYN_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dyn_dimensions(tensor, value)